From 8a1b2725a60d3267135c15e80984b4406054f650 Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Thu, 10 Dec 2015 09:59:25 +0200
Subject: usb: define USB_SPEED_SUPER_PLUS speed for SuperSpeedPlus USB3.1
 devices

Add a new USB_SPEED_SUPER_PLUS device speed, and make sure usb core can
handle the new speed.
In most cases the behaviour is the same as with USB_SPEED_SUPER SuperSpeed
devices. In a few places we add a "Plus" string to inform the user of the
new speed.

Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/usb/ch9.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index 4338eb7b09b3..779a62aafafe 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -954,6 +954,7 @@ enum usb_device_speed {
 	USB_SPEED_HIGH,				/* usb 2.0 */
 	USB_SPEED_WIRELESS,			/* wireless (usb 2.5) */
 	USB_SPEED_SUPER,			/* usb 3.0 */
+	USB_SPEED_SUPER_PLUS,			/* usb 3.1 */
 };
 
 
-- 
cgit v1.2.3


From 0cdd49a1d1a483d80170d9e592f832274e8bce1b Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Thu, 10 Dec 2015 09:59:29 +0200
Subject: usb: Support USB 3.1 extended port status request

usb 3.1 extend the hub get-port-status request by adding different
request types. the new request types return 4 additional bytes called
extended port status, these bytes are returned after the regular
portstatus and portchange values.

The extended port status contains a speed ID for the currently used
sublink speed. A table of supported Speed IDs with details about the link
is provided by the hub in the device descriptor BOS SuperSpeedPlus
device capability Sublink Speed Attributes.

Support this new request. Ask for the extended port status after port
reset if hub supports USB 3.1. If link is running at SuperSpeedPlus
set the device speed to USB_SPEED_SUPER_PLUS

Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/hcd.c        |  8 ++++-
 drivers/usb/core/hub.c        | 70 +++++++++++++++++++++++++++++++++++++------
 drivers/usb/core/hub.h        |  7 +++++
 include/uapi/linux/usb/ch11.h | 21 +++++++++++++
 4 files changed, 96 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c
index bca13a0e0326..9af9506352f3 100644
--- a/drivers/usb/core/hcd.c
+++ b/drivers/usb/core/hcd.c
@@ -668,9 +668,15 @@ nongeneric:
 		/* non-generic request */
 		switch (typeReq) {
 		case GetHubStatus:
-		case GetPortStatus:
 			len = 4;
 			break;
+		case GetPortStatus:
+			if (wValue == HUB_PORT_STATUS)
+				len = 4;
+			else
+				/* other port status types return 8 bytes */
+				len = 8;
+			break;
 		case GetHubDescriptor:
 			len = sizeof (struct usb_hub_descriptor);
 			break;
diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 6bdff0ad3930..3c9b22eb78ca 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -537,29 +537,34 @@ static int get_hub_status(struct usb_device *hdev,
 
 /*
  * USB 2.0 spec Section 11.24.2.7
+ * USB 3.1 takes into use the wValue and wLength fields, spec Section 10.16.2.6
  */
 static int get_port_status(struct usb_device *hdev, int port1,
-		struct usb_port_status *data)
+			   void *data, u16 value, u16 length)
 {
 	int i, status = -ETIMEDOUT;
 
 	for (i = 0; i < USB_STS_RETRIES &&
 			(status == -ETIMEDOUT || status == -EPIPE); i++) {
 		status = usb_control_msg(hdev, usb_rcvctrlpipe(hdev, 0),
-			USB_REQ_GET_STATUS, USB_DIR_IN | USB_RT_PORT, 0, port1,
-			data, sizeof(*data), USB_STS_TIMEOUT);
+			USB_REQ_GET_STATUS, USB_DIR_IN | USB_RT_PORT, value,
+			port1, data, length, USB_STS_TIMEOUT);
 	}
 	return status;
 }
 
-static int hub_port_status(struct usb_hub *hub, int port1,
-		u16 *status, u16 *change)
+static int hub_ext_port_status(struct usb_hub *hub, int port1, int type,
+			       u16 *status, u16 *change, u32 *ext_status)
 {
 	int ret;
+	int len = 4;
+
+	if (type != HUB_PORT_STATUS)
+		len = 8;
 
 	mutex_lock(&hub->status_mutex);
-	ret = get_port_status(hub->hdev, port1, &hub->status->port);
-	if (ret < 4) {
+	ret = get_port_status(hub->hdev, port1, &hub->status->port, type, len);
+	if (ret < len) {
 		if (ret != -ENODEV)
 			dev_err(hub->intfdev,
 				"%s failed (err = %d)\n", __func__, ret);
@@ -568,13 +573,22 @@ static int hub_port_status(struct usb_hub *hub, int port1,
 	} else {
 		*status = le16_to_cpu(hub->status->port.wPortStatus);
 		*change = le16_to_cpu(hub->status->port.wPortChange);
-
+		if (type != HUB_PORT_STATUS && ext_status)
+			*ext_status = le32_to_cpu(
+				hub->status->port.dwExtPortStatus);
 		ret = 0;
 	}
 	mutex_unlock(&hub->status_mutex);
 	return ret;
 }
 
+static int hub_port_status(struct usb_hub *hub, int port1,
+		u16 *status, u16 *change)
+{
+	return hub_ext_port_status(hub, port1, HUB_PORT_STATUS,
+				   status, change, NULL);
+}
+
 static void kick_hub_wq(struct usb_hub *hub)
 {
 	struct usb_interface *intf;
@@ -2612,6 +2626,32 @@ out_authorized:
 	return result;
 }
 
+/*
+ * Return 1 if port speed is SuperSpeedPlus, 0 otherwise
+ * check it from the link protocol field of the current speed ID attribute.
+ * current speed ID is got from ext port status request. Sublink speed attribute
+ * table is returned with the hub BOS SSP device capability descriptor
+ */
+static int port_speed_is_ssp(struct usb_device *hdev, int speed_id)
+{
+	int ssa_count;
+	u32 ss_attr;
+	int i;
+	struct usb_ssp_cap_descriptor *ssp_cap = hdev->bos->ssp_cap;
+
+	if (!ssp_cap)
+		return 0;
+
+	ssa_count = le32_to_cpu(ssp_cap->bmAttributes) &
+		USB_SSP_SUBLINK_SPEED_ATTRIBS;
+
+	for (i = 0; i <= ssa_count; i++) {
+		ss_attr = le32_to_cpu(ssp_cap->bmSublinkSpeedAttr[i]);
+		if (speed_id == (ss_attr & USB_SSP_SUBLINK_SPEED_SSID))
+			return !!(ss_attr & USB_SSP_SUBLINK_SPEED_LP);
+	}
+	return 0;
+}
 
 /* Returns 1 if @hub is a WUSB root hub, 0 otherwise */
 static unsigned hub_is_wusb(struct usb_hub *hub)
@@ -2676,6 +2716,7 @@ static int hub_port_wait_reset(struct usb_hub *hub, int port1,
 	int delay_time, ret;
 	u16 portstatus;
 	u16 portchange;
+	u32 ext_portstatus = 0;
 
 	for (delay_time = 0;
 			delay_time < HUB_RESET_TIMEOUT;
@@ -2684,7 +2725,14 @@ static int hub_port_wait_reset(struct usb_hub *hub, int port1,
 		msleep(delay);
 
 		/* read and decode port status */
-		ret = hub_port_status(hub, port1, &portstatus, &portchange);
+		if (hub_is_superspeedplus(hub->hdev))
+			ret = hub_ext_port_status(hub, port1,
+						  HUB_EXT_PORT_STATUS,
+						  &portstatus, &portchange,
+						  &ext_portstatus);
+		else
+			ret = hub_port_status(hub, port1, &portstatus,
+					      &portchange);
 		if (ret < 0)
 			return ret;
 
@@ -2727,6 +2775,10 @@ static int hub_port_wait_reset(struct usb_hub *hub, int port1,
 
 	if (hub_is_wusb(hub))
 		udev->speed = USB_SPEED_WIRELESS;
+	else if (hub_is_superspeedplus(hub->hdev) &&
+		 port_speed_is_ssp(hub->hdev, ext_portstatus &
+				   USB_EXT_PORT_STAT_RX_SPEED_ID))
+		udev->speed = USB_SPEED_SUPER_PLUS;
 	else if (hub_is_superspeed(hub->hdev))
 		udev->speed = USB_SPEED_SUPER;
 	else if (portstatus & USB_PORT_STAT_HIGH_SPEED)
diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h
index 45d070dd1d03..34c1a7e22aae 100644
--- a/drivers/usb/core/hub.h
+++ b/drivers/usb/core/hub.h
@@ -140,6 +140,13 @@ static inline int hub_is_superspeed(struct usb_device *hdev)
 	return hdev->descriptor.bDeviceProtocol == USB_HUB_PR_SS;
 }
 
+static inline int hub_is_superspeedplus(struct usb_device *hdev)
+{
+	return (hdev->descriptor.bDeviceProtocol == USB_HUB_PR_SS &&
+		le16_to_cpu(hdev->descriptor.bcdUSB) >= 0x0310 &&
+		hdev->bos->ssp_cap);
+}
+
 static inline unsigned hub_power_on_good_delay(struct usb_hub *hub)
 {
 	unsigned delay = hub->descriptor->bPwrOn2PwrGood * 2;
diff --git a/include/uapi/linux/usb/ch11.h b/include/uapi/linux/usb/ch11.h
index 331499d597fa..361297e96f58 100644
--- a/include/uapi/linux/usb/ch11.h
+++ b/include/uapi/linux/usb/ch11.h
@@ -29,6 +29,14 @@
 #define USB_RT_HUB	(USB_TYPE_CLASS | USB_RECIP_DEVICE)
 #define USB_RT_PORT	(USB_TYPE_CLASS | USB_RECIP_OTHER)
 
+/*
+ * Port status type for GetPortStatus requests added in USB 3.1
+ * See USB 3.1 spec Table 10-12
+ */
+#define HUB_PORT_STATUS		0
+#define HUB_PORT_PD_STATUS	1
+#define HUB_EXT_PORT_STATUS	2
+
 /*
  * Hub class requests
  * See USB 2.0 spec Table 11-16
@@ -97,10 +105,13 @@
 /*
  * Hub Status and Hub Change results
  * See USB 2.0 spec Table 11-19 and Table 11-20
+ * USB 3.1 extends the port status request and may return 4 additional bytes.
+ * See USB 3.1 spec section 10.16.2.6 Table 10-12 and 10-15
  */
 struct usb_port_status {
 	__le16 wPortStatus;
 	__le16 wPortChange;
+	__le32 dwExtPortStatus;
 } __attribute__ ((packed));
 
 /*
@@ -172,6 +183,16 @@ struct usb_port_status {
 #define USB_PORT_STAT_C_LINK_STATE	0x0040
 #define USB_PORT_STAT_C_CONFIG_ERROR	0x0080
 
+/*
+ * USB 3.1 dwExtPortStatus field masks
+ * See USB 3.1 spec 10.16.2.6.3 Table 10-15
+ */
+
+#define USB_EXT_PORT_STAT_RX_SPEED_ID	0x0000000f
+#define USB_EXT_PORT_STAT_TX_SPEED_ID	0x000000f0
+#define USB_EXT_PORT_STAT_RX_LANES	0x00000f00
+#define USB_EXT_PORT_STAT_TX_LANES	0x0000f000
+
 /*
  * wHubCharacteristics (masks)
  * See USB 2.0 spec Table 11-13, offset 3
-- 
cgit v1.2.3


From 333a6515344f5dc4ed0772b8fe252a2246f2e099 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
Date: Mon, 25 Jan 2016 07:29:13 -0200
Subject: Revert "[media] Postpone the addition of MEDIA_IOC_G_TOPOLOGY"

Enable MEDIA_IOC_G_TOPOLOGY ioctl for Kernel 4.6.

This reverts commit be0270ec89e6b9b49de7e533dd1f3a89ad34d205.

Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 Documentation/DocBook/media/v4l/media-ioc-g-topology.xml | 3 ---
 drivers/media/media-device.c                             | 7 +------
 include/uapi/linux/media.h                               | 6 +-----
 3 files changed, 2 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/DocBook/media/v4l/media-ioc-g-topology.xml b/Documentation/DocBook/media/v4l/media-ioc-g-topology.xml
index 63152ab9efba..e0d49fa329f0 100644
--- a/Documentation/DocBook/media/v4l/media-ioc-g-topology.xml
+++ b/Documentation/DocBook/media/v4l/media-ioc-g-topology.xml
@@ -48,9 +48,6 @@
 
   <refsect1>
     <title>Description</title>
-
-    <para><emphasis role="bold">NOTE:</emphasis> This new ioctl is programmed to be added on Kernel 4.6. Its definition/arguments may change until its final version.</para>
-
     <para>The typical usage of this ioctl is to call it twice.
     On the first call, the structure defined at &media-v2-topology; should
     be zeroed. At return, if no errors happen, this ioctl will return the
diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c
index 7dae0ac0f3ae..4d1c13de494b 100644
--- a/drivers/media/media-device.c
+++ b/drivers/media/media-device.c
@@ -234,7 +234,6 @@ static long media_device_setup_link(struct media_device *mdev,
 	return ret;
 }
 
-#if 0 /* Let's postpone it to Kernel 4.6 */
 static long __media_device_get_topology(struct media_device *mdev,
 				      struct media_v2_topology *topo)
 {
@@ -390,7 +389,6 @@ static long media_device_get_topology(struct media_device *mdev,
 
 	return 0;
 }
-#endif
 
 static long media_device_ioctl(struct file *filp, unsigned int cmd,
 			       unsigned long arg)
@@ -424,14 +422,13 @@ static long media_device_ioctl(struct file *filp, unsigned int cmd,
 		mutex_unlock(&dev->graph_mutex);
 		break;
 
-#if 0 /* Let's postpone it to Kernel 4.6 */
 	case MEDIA_IOC_G_TOPOLOGY:
 		mutex_lock(&dev->graph_mutex);
 		ret = media_device_get_topology(dev,
 				(struct media_v2_topology __user *)arg);
 		mutex_unlock(&dev->graph_mutex);
 		break;
-#endif
+
 	default:
 		ret = -ENOIOCTLCMD;
 	}
@@ -480,9 +477,7 @@ static long media_device_compat_ioctl(struct file *filp, unsigned int cmd,
 	case MEDIA_IOC_DEVICE_INFO:
 	case MEDIA_IOC_ENUM_ENTITIES:
 	case MEDIA_IOC_SETUP_LINK:
-#if 0 /* Let's postpone it to Kernel 4.6 */
 	case MEDIA_IOC_G_TOPOLOGY:
-#endif
 		return media_device_ioctl(filp, cmd, arg);
 
 	case MEDIA_IOC_ENUM_LINKS32:
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index 1e3c8cb43bd7..5dbb208e5451 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -286,7 +286,7 @@ struct media_links_enum {
  *	  later, before the adding this API upstream.
  */
 
-#if 0 /* Let's postpone it to Kernel 4.6 */
+
 struct media_v2_entity {
 	__u32 id;
 	char name[64];		/* FIXME: move to a property? (RFC says so) */
@@ -351,7 +351,6 @@ static inline void __user *media_get_uptr(__u64 arg)
 {
 	return (void __user *)(uintptr_t)arg;
 }
-#endif
 
 /* ioctls */
 
@@ -359,9 +358,6 @@ static inline void __user *media_get_uptr(__u64 arg)
 #define MEDIA_IOC_ENUM_ENTITIES		_IOWR('|', 0x01, struct media_entity_desc)
 #define MEDIA_IOC_ENUM_LINKS		_IOWR('|', 0x02, struct media_links_enum)
 #define MEDIA_IOC_SETUP_LINK		_IOWR('|', 0x03, struct media_link_desc)
-
-#if 0 /* Let's postpone it to Kernel 4.6 */
 #define MEDIA_IOC_G_TOPOLOGY		_IOWR('|', 0x04, struct media_v2_topology)
-#endif
 
 #endif /* __LINUX_MEDIA_H */
-- 
cgit v1.2.3


From 133e1e5acd4a63c4a0dcc413e90d5decdbce9c4a Mon Sep 17 00:00:00 2001
From: Richard Guy Briggs <rgb@redhat.com>
Date: Mon, 25 Jan 2016 18:04:15 -0500
Subject: audit: stop an old auditd being starved out by a new auditd

Nothing prevents a new auditd starting up and replacing a valid
audit_pid when an old auditd is still running, effectively starving out
the old auditd since audit_pid no longer points to the old valid
auditd.

If no message to auditd has been attempted since auditd died
unnaturally or got killed, audit_pid will still indicate it is alive.
There isn't an easy way to detect if an old auditd is still running on
the existing audit_pid other than attempting to send a message to see
if it fails.  An -ECONNREFUSED almost certainly means it disappeared
and can be replaced.  Other errors are not so straightforward and may
indicate transient problems that will resolve themselves and the old
auditd will recover.  Yet others will likely need manual intervention
for which a new auditd will not solve the problem.

Send a new message type (AUDIT_REPLACE) to the old auditd containing a
u32 with the PID of the new auditd.  If the audit replace message
succeeds (or doesn't fail with certainty), fail to register the new
auditd and return an error (-EEXIST).

This is expected to make the patch preventing an old auditd orphaning a
new auditd redundant.

V3: Switch audit message type from 1000 to 1300 block.

Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
Signed-off-by: Paul Moore <pmoore@redhat.com>
---
 include/uapi/linux/audit.h |  1 +
 kernel/audit.c             | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 843540c398eb..d820aa979620 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -110,6 +110,7 @@
 #define AUDIT_SECCOMP		1326	/* Secure Computing event */
 #define AUDIT_PROCTITLE		1327	/* Proctitle emit event */
 #define AUDIT_FEATURE_CHANGE	1328	/* audit log listing feature changes */
+#define AUDIT_REPLACE		1329	/* Replace auditd if this packet unanswerd */
 
 #define AUDIT_AVC		1400	/* SE Linux avc denial or grant */
 #define AUDIT_SELINUX_ERR	1401	/* Internal SE Linux Errors */
diff --git a/kernel/audit.c b/kernel/audit.c
index d6dd95cc59e6..2fd63d6879c5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -809,6 +809,16 @@ static int audit_set_feature(struct sk_buff *skb)
 	return 0;
 }
 
+static int audit_replace(pid_t pid)
+{
+	struct sk_buff *skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0,
+					       &pid, sizeof(pid));
+
+	if (!skb)
+		return -ENOMEM;
+	return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
+}
+
 static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
 	u32			seq;
@@ -870,9 +880,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 		if (s.mask & AUDIT_STATUS_PID) {
 			int new_pid = s.pid;
+			pid_t requesting_pid = task_tgid_vnr(current);
 
-			if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
+			if ((!new_pid) && (requesting_pid != audit_pid))
 				return -EACCES;
+			if (audit_pid && new_pid &&
+			    audit_replace(requesting_pid) != -ECONNREFUSED)
+				return -EEXIST;
 			if (audit_enabled != AUDIT_OFF)
 				audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
 			audit_pid = new_pid;
-- 
cgit v1.2.3


From ecb3a7ccc653fc8007edd0591d32d839386673f1 Mon Sep 17 00:00:00 2001
From: Matt Ranostay <mranostay@gmail.com>
Date: Tue, 26 Jan 2016 18:34:30 -0800
Subject: iio: ph: add IIO_PH channel type

Signed-off-by: Matt Ranostay <mranostay@gmail.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 Documentation/ABI/testing/sysfs-bus-iio | 7 +++++++
 drivers/iio/industrialio-core.c         | 1 +
 include/uapi/linux/iio/types.h          | 1 +
 3 files changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio
index 8fadd272ad8a..80c6fce9935b 100644
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1493,3 +1493,10 @@ Description:
 		This ABI is especially applicable for humidity sensors
 		to heatup the device and get rid of any condensation
 		in some humidity environment
+
+What:		/sys/bus/iio/devices/iio:deviceX/in_ph_raw
+KernelVersion:	4.5
+Contact:	linux-iio@vger.kernel.org
+Description:
+		Raw (unscaled no offset etc.) pH reading of a substance as a negative
+		base-10 logarithm of hydrodium ions in a litre of water.
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
index fd01f3493fc7..eb01a83a5b7c 100644
--- a/drivers/iio/industrialio-core.c
+++ b/drivers/iio/industrialio-core.c
@@ -77,6 +77,7 @@ static const char * const iio_chan_type_name_spec[] = {
 	[IIO_VELOCITY] = "velocity",
 	[IIO_CONCENTRATION] = "concentration",
 	[IIO_RESISTANCE] = "resistance",
+	[IIO_PH] = "ph",
 };
 
 static const char * const iio_modifier_names[] = {
diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h
index 7c63bd67c36e..c077617f3304 100644
--- a/include/uapi/linux/iio/types.h
+++ b/include/uapi/linux/iio/types.h
@@ -37,6 +37,7 @@ enum iio_chan_type {
 	IIO_VELOCITY,
 	IIO_CONCENTRATION,
 	IIO_RESISTANCE,
+	IIO_PH,
 };
 
 enum iio_modifier {
-- 
cgit v1.2.3


From 06131932c1d5f1bd09832f42be658906a27749fb Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
Date: Thu, 28 Jan 2016 07:28:08 -0200
Subject: [media] media.h: add support for IF-PLL video/sound decoder

Very old hardware may have an analog stage tuner. Those hardware
consists of a PLL that converts a RF signal into IF signals.

Depending on the hardware, those video IF signal can be
decoded directly by the bridge chipset. Most Conexant
chips (bt8x8, cx2388x, etc) have internally the decoders
for that. Yet, even on such hardware, the tuner may have
internally its own TV multi-standard decoder like tda9887.

The same happens with the audio IF signal, where some bridges
are capable of receiving it, while others require an external
IF-PLL sound decoder, like msp3400.

Those external IF-PLL audio and video decoders have their own
I2C address, and use different drivers to handle them. So, they're
mapped as different subdevices on Linux.

Thankfully, all modern hardware comes with an IC chip that
has both the RF and the IF stages on it, being capable of
decoding audio and video IF signals internally.

Yet, as we need to support drivers that can work with either
analog or silicon tuners, we need to add two entity types
for those old hardware.

Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 Documentation/DocBook/media/v4l/media-types.xml | 29 ++++++++++++++++++++++++-
 include/uapi/linux/media.h                      | 17 +++++++++++++--
 2 files changed, 43 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/DocBook/media/v4l/media-types.xml b/Documentation/DocBook/media/v4l/media-types.xml
index 1af384250910..751c3d027103 100644
--- a/Documentation/DocBook/media/v4l/media-types.xml
+++ b/Documentation/DocBook/media/v4l/media-types.xml
@@ -84,7 +84,34 @@
 	  </row>
 	  <row>
 	    <entry><constant>MEDIA_ENT_F_TUNER</constant></entry>
-	    <entry>Digital TV, analog TV, radio and/or software radio tuner.</entry>
+	    <entry>Digital TV, analog TV, radio and/or software radio tuner,
+		   with consists on a PLL tuning stage that converts radio
+		   frequency (RF) signal into an Intermediate Frequency (IF).
+		   Modern tuners have internally IF-PLL decoders for audio
+		   and video, but older models have those stages implemented
+		   on separate entities.
+	    </entry>
+	  </row>
+	  <row>
+	    <entry><constant>MEDIA_ENT_F_IF_VID_DECODER</constant></entry>
+	    <entry>IF-PLL video decoder. It receives the IF from a PLL
+		   and decodes the analog TV video signal. This is commonly
+		   found on some very old analog tuners, like Philips MK3
+		   designs. They all contain a tda9887 (or some software
+		   compatible similar chip, like tda9885). Those devices
+		   use a different I2C address than the tuner PLL.
+	    </entry>
+	  </row>
+	  <row>
+	    <entry><constant>MEDIA_ENT_F_IF_AUD_DECODER</constant></entry>
+	    <entry>IF-PLL sound decoder. It receives the IF from a PLL
+		   and decodes the analog TV audio signal. This is commonly
+		   found on some very old analog hardware, like Micronas
+		   msp3400, Philips tda9840, tda985x, etc. Those devices
+		   use a different I2C address than the tuner PLL and
+		   should be controlled together with the IF-PLL video
+		   decoder.
+	    </entry>
 	  </row>
 	</tbody>
       </tgroup>
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index 5dbb208e5451..c9eb42a6c021 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -88,6 +88,15 @@ struct media_device_info {
 #define MEDIA_ENT_F_IO_VBI  		(MEDIA_ENT_F_BASE + 32)
 #define MEDIA_ENT_F_IO_SWRADIO		(MEDIA_ENT_F_BASE + 33)
 
+/*
+ * Analog TV IF-PLL decoders
+ *
+ * It is a responsibility of the master/bridge drivers to create links
+ * for MEDIA_ENT_F_IF_VID_DECODER and MEDIA_ENT_F_IF_AUD_DECODER.
+ */
+#define MEDIA_ENT_F_IF_VID_DECODER	(MEDIA_ENT_F_BASE + 41)
+#define MEDIA_ENT_F_IF_AUD_DECODER	(MEDIA_ENT_F_BASE + 42)
+
 /*
  * Don't touch on those. The ranges MEDIA_ENT_F_OLD_BASE and
  * MEDIA_ENT_F_OLD_SUBDEV_BASE are kept to keep backward compatibility
@@ -107,8 +116,12 @@ struct media_device_info {
 #define MEDIA_ENT_F_LENS		(MEDIA_ENT_F_OLD_SUBDEV_BASE + 3)
 #define MEDIA_ENT_F_ATV_DECODER		(MEDIA_ENT_F_OLD_SUBDEV_BASE + 4)
 /*
- * It is a responsibility of the entity drivers to add connectors and links
- *	for the tuner entities.
+ * It is a responsibility of the master/bridge drivers to add connectors
+ * and links for MEDIA_ENT_F_TUNER. Please notice that some old tuners
+ * may require the usage of separate I2C chips to decode analog TV signals,
+ * when the master/bridge chipset doesn't have its own TV standard decoder.
+ * On such cases, the IF-PLL staging is mapped via one or two entities:
+ * MEDIA_ENT_F_IF_VID_DECODER and/or MEDIA_ENT_F_IF_AUD_DECODER.
  */
 #define MEDIA_ENT_F_TUNER		(MEDIA_ENT_F_OLD_SUBDEV_BASE + 5)
 
-- 
cgit v1.2.3


From 80100fd9ebb9f2414892a1178d26a4253e6c0bcf Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@linux.intel.com>
Date: Mon, 11 Jan 2016 11:21:15 -0200
Subject: [media] media: v4l: Dual license v4l2-common.h under GPL v2 and BSD
 licenses

The v4l2-common.h user space header was split off from videodev2.h, but
the dual licensing of the videodev2.h (as well as other V4L2 headers) was
missed. Change the license of the v4l2-common.h from GNU GPL v2 to both
GNU GPL v2 and BSD.

Sakari Ailus <sakari.ailus@iki.fi>:
> Would you approve a license change of the patches to
> include/uapi/linux/v4l2-common.h (formerly include/linux/v4l2-common.h) you
> or your company have contributed from GNU GPL v2 to dual GNU GPL v2 and BSD
> licenses, changing the copyright notice in the file as below (from
> videodev2.h):
>
> -------------8<------------
>   *  This program is free software; you can redistribute it and/or modify
>   *  it under the terms of the GNU General Public License as published by
>   *  the Free Software Foundation; either version 2 of the License, or
>   *  (at your option) any later version.
>   *
>   *  This program is distributed in the hope that it will be useful,
>   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
>   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>   *  GNU General Public License for more details.
>   *
>   *  Alternatively you can redistribute this file under the terms of the
>   *  BSD license as stated below:
>   *
>   *  Redistribution and use in source and binary forms, with or without
>   *  modification, are permitted provided that the following conditions
>   *  are met:
>   *  1. Redistributions of source code must retain the above copyright
>   *     notice, this list of conditions and the following disclaimer.
>   *  2. Redistributions in binary form must reproduce the above copyright
>   *     notice, this list of conditions and the following disclaimer in
>   *     the documentation and/or other materials provided with the
>   *     distribution.
>   *  3. The names of its contributors may not be used to endorse or promote
>   *     products derived from this software without specific prior written
>   *     permission.
>   *
>   *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
>   *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
>   *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
>   *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
>   *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
>   *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
>   *  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
>   *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
>   *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
>   *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
>   *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> -------------8<------------

Mauro Carvalho Chehab <mchehab@osg.samsung.com>:
> No problem from my side.

Hans Verkuil <hans.verkuil@cisco.com>:
> Acked-by: Hans Verkuil <hans.verkuil@cisco.com>

Aaro Koskinen <aaro.koskinen@nokia.com>:
> This fine also for us.
>
> Acked-by: Aaro Koskinen <aaro.koskinen@nokia.com>

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Acked-by: Hans Verkuil <hans.verkuil@cisco.com>
Acked-by: Aaro Koskinen <aaro.koskinen@nokia.com>
Acked-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 include/uapi/linux/v4l2-common.h | 46 ++++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/v4l2-common.h b/include/uapi/linux/v4l2-common.h
index 15273987093e..5b3f685a2d50 100644
--- a/include/uapi/linux/v4l2-common.h
+++ b/include/uapi/linux/v4l2-common.h
@@ -10,19 +10,43 @@
  * Copyright (C) 2012 Nokia Corporation
  * Contact: Sakari Ailus <sakari.ailus@iki.fi>
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
  *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
- * 02110-1301 USA
+ *  Alternatively you can redistribute this file under the terms of the
+ *  BSD license as stated below:
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions
+ *  are met:
+ *  1. Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *  2. Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *  3. The names of its contributors may not be used to endorse or promote
+ *     products derived from this software without specific prior written
+ *     permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ *  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
 
-- 
cgit v1.2.3


From dbf3e7f654c0f06a932b8fcafac78de9d0b81d68 Mon Sep 17 00:00:00 2001
From: Dave Penkler <dpenkler@gmail.com>
Date: Wed, 27 Jan 2016 19:09:24 +0100
Subject: Implement an ioctl to support the USMTMC-USB488 READ_STATUS_BYTE
 operation.

Background:
When performing a read on an instrument that is executing a function
that runs longer than the USB timeout the instrument may hang and
require a device reset to recover. The READ_STATUS_BYTE operation
always returns even when the instrument is busy permitting to poll
for the appropriate condition. This capability is referred to in
instrument application notes on synchronizing acquisitions for other
platforms.

Signed-off-by: Dave Penkler <dpenkler@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/class/usbtmc.c   | 201 +++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/usb/tmc.h |   2 +
 2 files changed, 203 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index 7a11a8263171..c2eb6fe9f4c8 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c
@@ -87,6 +87,19 @@ struct usbtmc_device_data {
 	u8 bTag_last_write;	/* needed for abort */
 	u8 bTag_last_read;	/* needed for abort */
 
+	/* data for interrupt in endpoint handling */
+	u8             bNotify1;
+	u8             bNotify2;
+	u16            ifnum;
+	u8             iin_bTag;
+	u8            *iin_buffer;
+	atomic_t       iin_data_valid;
+	unsigned int   iin_ep;
+	int            iin_ep_present;
+	int            iin_interval;
+	struct urb    *iin_urb;
+	u16            iin_wMaxPacketSize;
+
 	u8 rigol_quirk;
 
 	/* attributes from the USB TMC spec for this device */
@@ -99,6 +112,7 @@ struct usbtmc_device_data {
 	struct usbtmc_dev_capabilities	capabilities;
 	struct kref kref;
 	struct mutex io_mutex;	/* only one i/o function running at a time */
+	wait_queue_head_t waitq;
 };
 #define to_usbtmc_data(d) container_of(d, struct usbtmc_device_data, kref)
 
@@ -373,6 +387,84 @@ exit:
 	return rv;
 }
 
+static int usbtmc488_ioctl_read_stb(struct usbtmc_device_data *data,
+				void __user *arg)
+{
+	struct device *dev = &data->intf->dev;
+	u8 *buffer;
+	u8 tag;
+	__u8 stb;
+	int rv;
+
+	dev_dbg(dev, "Enter ioctl_read_stb iin_ep_present: %d\n",
+		data->iin_ep_present);
+
+	buffer = kmalloc(8, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	atomic_set(&data->iin_data_valid, 0);
+
+	rv = usb_control_msg(data->usb_dev,
+			usb_rcvctrlpipe(data->usb_dev, 0),
+			USBTMC488_REQUEST_READ_STATUS_BYTE,
+			USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE,
+			data->iin_bTag,
+			data->ifnum,
+			buffer, 0x03, USBTMC_TIMEOUT);
+	if (rv < 0) {
+		dev_err(dev, "stb usb_control_msg returned %d\n", rv);
+		goto exit;
+	}
+
+	if (buffer[0] != USBTMC_STATUS_SUCCESS) {
+		dev_err(dev, "control status returned %x\n", buffer[0]);
+		rv = -EIO;
+		goto exit;
+	}
+
+	if (data->iin_ep_present) {
+		rv = wait_event_interruptible_timeout(
+			data->waitq,
+			atomic_read(&data->iin_data_valid) != 0,
+			USBTMC_TIMEOUT);
+		if (rv < 0) {
+			dev_dbg(dev, "wait interrupted %d\n", rv);
+			goto exit;
+		}
+
+		if (rv == 0) {
+			dev_dbg(dev, "wait timed out\n");
+			rv = -ETIME;
+			goto exit;
+		}
+
+		tag = data->bNotify1 & 0x7f;
+		if (tag != data->iin_bTag) {
+			dev_err(dev, "expected bTag %x got %x\n",
+				data->iin_bTag, tag);
+		}
+
+		stb = data->bNotify2;
+	} else {
+		stb = buffer[2];
+	}
+
+	rv = copy_to_user(arg, &stb, sizeof(stb));
+	if (rv)
+		rv = -EFAULT;
+
+ exit:
+	/* bump interrupt bTag */
+	data->iin_bTag += 1;
+	if (data->iin_bTag > 127)
+		/* 1 is for SRQ see USBTMC-USB488 subclass spec section 4.3.1 */
+		data->iin_bTag = 2;
+
+	kfree(buffer);
+	return rv;
+}
+
 /*
  * Sends a REQUEST_DEV_DEP_MSG_IN message on the Bulk-IN endpoint.
  * @transfer_size: number of bytes to request from the device.
@@ -1069,6 +1161,10 @@ static long usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case USBTMC_IOCTL_ABORT_BULK_IN:
 		retval = usbtmc_ioctl_abort_bulk_in(data);
 		break;
+
+	case USBTMC488_IOCTL_READ_STB:
+		retval = usbtmc488_ioctl_read_stb(data, (void __user *)arg);
+		break;
 	}
 
 skip_io_on_zombie:
@@ -1092,6 +1188,57 @@ static struct usb_class_driver usbtmc_class = {
 	.minor_base =	USBTMC_MINOR_BASE,
 };
 
+static void usbtmc_interrupt(struct urb *urb)
+{
+	struct usbtmc_device_data *data = urb->context;
+	struct device *dev = &data->intf->dev;
+	int status = urb->status;
+	int rv;
+
+	dev_dbg(&data->intf->dev, "int status: %d len %d\n",
+		status, urb->actual_length);
+
+	switch (status) {
+	case 0: /* SUCCESS */
+		/* check for valid STB notification */
+		if (data->iin_buffer[0] > 0x81) {
+			data->bNotify1 = data->iin_buffer[0];
+			data->bNotify2 = data->iin_buffer[1];
+			atomic_set(&data->iin_data_valid, 1);
+			wake_up_interruptible(&data->waitq);
+			goto exit;
+		}
+		dev_warn(dev, "invalid notification: %x\n", data->iin_buffer[0]);
+		break;
+	case -EOVERFLOW:
+		dev_err(dev, "overflow with length %d, actual length is %d\n",
+			data->iin_wMaxPacketSize, urb->actual_length);
+	case -ECONNRESET:
+	case -ENOENT:
+	case -ESHUTDOWN:
+	case -EILSEQ:
+	case -ETIME:
+		/* urb terminated, clean up */
+		dev_dbg(dev, "urb terminated, status: %d\n", status);
+		return;
+	default:
+		dev_err(dev, "unknown status received: %d\n", status);
+	}
+exit:
+	rv = usb_submit_urb(urb, GFP_ATOMIC);
+	if (rv)
+		dev_err(dev, "usb_submit_urb failed: %d\n", rv);
+}
+
+static void usbtmc_free_int(struct usbtmc_device_data *data)
+{
+	if (!data->iin_ep_present || !data->iin_urb)
+		return;
+	usb_kill_urb(data->iin_urb);
+	kfree(data->iin_buffer);
+	usb_free_urb(data->iin_urb);
+	kref_put(&data->kref, usbtmc_delete);
+}
 
 static int usbtmc_probe(struct usb_interface *intf,
 			const struct usb_device_id *id)
@@ -1114,6 +1261,8 @@ static int usbtmc_probe(struct usb_interface *intf,
 	usb_set_intfdata(intf, data);
 	kref_init(&data->kref);
 	mutex_init(&data->io_mutex);
+	init_waitqueue_head(&data->waitq);
+	atomic_set(&data->iin_data_valid, 0);
 	data->zombie = 0;
 
 	/* Determine if it is a Rigol or not */
@@ -1134,9 +1283,12 @@ static int usbtmc_probe(struct usb_interface *intf,
 	data->bTag	= 1;
 	data->TermCharEnabled = 0;
 	data->TermChar = '\n';
+	/*  2 <= bTag <= 127   USBTMC-USB488 subclass specification 4.3.1 */
+	data->iin_bTag = 2;
 
 	/* USBTMC devices have only one setting, so use that */
 	iface_desc = data->intf->cur_altsetting;
+	data->ifnum = iface_desc->desc.bInterfaceNumber;
 
 	/* Find bulk in endpoint */
 	for (n = 0; n < iface_desc->desc.bNumEndpoints; n++) {
@@ -1161,6 +1313,20 @@ static int usbtmc_probe(struct usb_interface *intf,
 			break;
 		}
 	}
+	/* Find int endpoint */
+	for (n = 0; n < iface_desc->desc.bNumEndpoints; n++) {
+		endpoint = &iface_desc->endpoint[n].desc;
+
+		if (usb_endpoint_is_int_in(endpoint)) {
+			data->iin_ep_present = 1;
+			data->iin_ep = endpoint->bEndpointAddress;
+			data->iin_wMaxPacketSize = usb_endpoint_maxp(endpoint);
+			data->iin_interval = endpoint->bInterval;
+			dev_dbg(&intf->dev, "Found Int in endpoint at %u\n",
+				data->iin_ep);
+			break;
+		}
+	}
 
 	retcode = get_capabilities(data);
 	if (retcode)
@@ -1169,6 +1335,39 @@ static int usbtmc_probe(struct usb_interface *intf,
 		retcode = sysfs_create_group(&intf->dev.kobj,
 					     &capability_attr_grp);
 
+	if (data->iin_ep_present) {
+		/* allocate int urb */
+		data->iin_urb = usb_alloc_urb(0, GFP_KERNEL);
+		if (!data->iin_urb) {
+			dev_err(&intf->dev, "Failed to allocate int urb\n");
+			goto error_register;
+		}
+
+		/* will reference data in int urb */
+		kref_get(&data->kref);
+
+		/* allocate buffer for interrupt in */
+		data->iin_buffer = kmalloc(data->iin_wMaxPacketSize,
+					GFP_KERNEL);
+		if (!data->iin_buffer) {
+			dev_err(&intf->dev, "Failed to allocate int buf\n");
+			goto error_register;
+		}
+
+		/* fill interrupt urb */
+		usb_fill_int_urb(data->iin_urb, data->usb_dev,
+				usb_rcvintpipe(data->usb_dev, data->iin_ep),
+				data->iin_buffer, data->iin_wMaxPacketSize,
+				usbtmc_interrupt,
+				data, data->iin_interval);
+
+		retcode = usb_submit_urb(data->iin_urb, GFP_KERNEL);
+		if (retcode) {
+			dev_err(&intf->dev, "Failed to submit iin_urb\n");
+			goto error_register;
+		}
+	}
+
 	retcode = sysfs_create_group(&intf->dev.kobj, &data_attr_grp);
 
 	retcode = usb_register_dev(intf, &usbtmc_class);
@@ -1185,6 +1384,7 @@ static int usbtmc_probe(struct usb_interface *intf,
 error_register:
 	sysfs_remove_group(&intf->dev.kobj, &capability_attr_grp);
 	sysfs_remove_group(&intf->dev.kobj, &data_attr_grp);
+	usbtmc_free_int(data);
 	kref_put(&data->kref, usbtmc_delete);
 	return retcode;
 }
@@ -1196,6 +1396,7 @@ static void usbtmc_disconnect(struct usb_interface *intf)
 	dev_dbg(&intf->dev, "usbtmc_disconnect called\n");
 
 	data = usb_get_intfdata(intf);
+	usbtmc_free_int(data);
 	usb_deregister_dev(intf, &usbtmc_class);
 	sysfs_remove_group(&intf->dev.kobj, &capability_attr_grp);
 	sysfs_remove_group(&intf->dev.kobj, &data_attr_grp);
diff --git a/include/uapi/linux/usb/tmc.h b/include/uapi/linux/usb/tmc.h
index c045ae12556c..7e5ced80548f 100644
--- a/include/uapi/linux/usb/tmc.h
+++ b/include/uapi/linux/usb/tmc.h
@@ -30,6 +30,7 @@
 #define USBTMC_REQUEST_CHECK_CLEAR_STATUS		6
 #define USBTMC_REQUEST_GET_CAPABILITIES			7
 #define USBTMC_REQUEST_INDICATOR_PULSE			64
+#define USBTMC488_REQUEST_READ_STATUS_BYTE		128
 
 /* Request values for USBTMC driver's ioctl entry point */
 #define USBTMC_IOC_NR			91
@@ -39,5 +40,6 @@
 #define USBTMC_IOCTL_ABORT_BULK_IN	_IO(USBTMC_IOC_NR, 4)
 #define USBTMC_IOCTL_CLEAR_OUT_HALT	_IO(USBTMC_IOC_NR, 6)
 #define USBTMC_IOCTL_CLEAR_IN_HALT	_IO(USBTMC_IOC_NR, 7)
+#define USBTMC488_IOCTL_READ_STB	_IOR(USBTMC_IOC_NR, 18, unsigned char)
 
 #endif
-- 
cgit v1.2.3


From 29779d89fd049bfc6c07f19aaf9b8d19fe2ecc8c Mon Sep 17 00:00:00 2001
From: Dave Penkler <dpenkler@gmail.com>
Date: Wed, 27 Jan 2016 19:22:28 +0100
Subject: Add ioctl to retrieve USBTMC-USB488 capabilities

This is a convenience function to obtain an instrument's
capabilities from its file descriptor without having to access sysfs
from the user program.

Signed-off-by: Dave Penkler <dpenkler@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/class/usbtmc.c   | 12 ++++++++++++
 include/uapi/linux/usb/tmc.h | 21 ++++++++++++++++++---
 2 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index 867842951ea0..6edb21ca9989 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c
@@ -102,6 +102,9 @@ struct usbtmc_device_data {
 	u16            iin_wMaxPacketSize;
 	atomic_t       srq_asserted;
 
+	/* coalesced usb488_caps from usbtmc_dev_capabilities */
+	__u8 usb488_caps;
+
 	u8 rigol_quirk;
 
 	/* attributes from the USB TMC spec for this device */
@@ -993,6 +996,7 @@ static int get_capabilities(struct usbtmc_device_data *data)
 	data->capabilities.device_capabilities = buffer[5];
 	data->capabilities.usb488_interface_capabilities = buffer[14];
 	data->capabilities.usb488_device_capabilities = buffer[15];
+	data->usb488_caps = (buffer[14] & 0x07) | ((buffer[15] & 0x0f) << 4);
 	rv = 0;
 
 err_out:
@@ -1168,6 +1172,14 @@ static long usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		retval = usbtmc_ioctl_abort_bulk_in(data);
 		break;
 
+	case USBTMC488_IOCTL_GET_CAPS:
+		retval = copy_to_user((void __user *)arg,
+				&data->usb488_caps,
+				sizeof(data->usb488_caps));
+		if (retval)
+			retval = -EFAULT;
+		break;
+
 	case USBTMC488_IOCTL_READ_STB:
 		retval = usbtmc488_ioctl_read_stb(data, (void __user *)arg);
 		break;
diff --git a/include/uapi/linux/usb/tmc.h b/include/uapi/linux/usb/tmc.h
index 7e5ced80548f..b512fb2d403e 100644
--- a/include/uapi/linux/usb/tmc.h
+++ b/include/uapi/linux/usb/tmc.h
@@ -2,12 +2,14 @@
  * Copyright (C) 2007 Stefan Kopp, Gechingen, Germany
  * Copyright (C) 2008 Novell, Inc.
  * Copyright (C) 2008 Greg Kroah-Hartman <gregkh@suse.de>
+ * Copyright (C) 2015 Dave Penkler <dpenkler@gmail.com>
  *
  * This file holds USB constants defined by the USB Device Class
- * Definition for Test and Measurement devices published by the USB-IF.
+ * and USB488 Subclass Definitions for Test and Measurement devices
+ * published by the USB-IF.
  *
- * It also has the ioctl definitions for the usbtmc kernel driver that
- * userspace needs to know about.
+ * It also has the ioctl and capability definitions for the
+ * usbtmc kernel driver that userspace needs to know about.
  */
 
 #ifndef __LINUX_USB_TMC_H
@@ -40,6 +42,19 @@
 #define USBTMC_IOCTL_ABORT_BULK_IN	_IO(USBTMC_IOC_NR, 4)
 #define USBTMC_IOCTL_CLEAR_OUT_HALT	_IO(USBTMC_IOC_NR, 6)
 #define USBTMC_IOCTL_CLEAR_IN_HALT	_IO(USBTMC_IOC_NR, 7)
+#define USBTMC488_IOCTL_GET_CAPS	_IOR(USBTMC_IOC_NR, 17, unsigned char)
 #define USBTMC488_IOCTL_READ_STB	_IOR(USBTMC_IOC_NR, 18, unsigned char)
 
+/* Driver encoded usb488 capabilities */
+#define USBTMC488_CAPABILITY_TRIGGER         1
+#define USBTMC488_CAPABILITY_SIMPLE          2
+#define USBTMC488_CAPABILITY_REN_CONTROL     2
+#define USBTMC488_CAPABILITY_GOTO_LOCAL      2
+#define USBTMC488_CAPABILITY_LOCAL_LOCKOUT   2
+#define USBTMC488_CAPABILITY_488_DOT_2       4
+#define USBTMC488_CAPABILITY_DT1             16
+#define USBTMC488_CAPABILITY_RL1             32
+#define USBTMC488_CAPABILITY_SR1             64
+#define USBTMC488_CAPABILITY_FULL_SCPI       128
+
 #endif
-- 
cgit v1.2.3


From 379d3d33c83b667b0edad0110693567306463882 Mon Sep 17 00:00:00 2001
From: Dave Penkler <dpenkler@gmail.com>
Date: Wed, 27 Jan 2016 19:25:24 +0100
Subject: Add ioctls to enable and disable local controls on an instrument

These ioctls provide support for the USBTMC-USB488 control requests
for REN_CONTROL, GO_TO_LOCAL and LOCAL_LOCKOUT

Signed-off-by: Dave Penkler <dpenkler@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/class/usbtmc.c   | 70 ++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/usb/tmc.h |  6 ++++
 2 files changed, 76 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index 6edb21ca9989..419c72e10464 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c
@@ -474,6 +474,61 @@ static int usbtmc488_ioctl_read_stb(struct usbtmc_device_data *data,
 	return rv;
 }
 
+static int usbtmc488_ioctl_simple(struct usbtmc_device_data *data,
+				void __user *arg, unsigned int cmd)
+{
+	struct device *dev = &data->intf->dev;
+	__u8 val;
+	u8 *buffer;
+	u16 wValue;
+	int rv;
+
+	if (!(data->usb488_caps & USBTMC488_CAPABILITY_SIMPLE))
+		return -EINVAL;
+
+	buffer = kmalloc(8, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	if (cmd == USBTMC488_REQUEST_REN_CONTROL) {
+		rv = copy_from_user(&val, arg, sizeof(val));
+		if (rv) {
+			rv = -EFAULT;
+			goto exit;
+		}
+		wValue = val ? 1 : 0;
+	} else {
+		wValue = 0;
+	}
+
+	rv = usb_control_msg(data->usb_dev,
+			usb_rcvctrlpipe(data->usb_dev, 0),
+			cmd,
+			USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE,
+			wValue,
+			data->ifnum,
+			buffer, 0x01, USBTMC_TIMEOUT);
+	if (rv < 0) {
+		dev_err(dev, "simple usb_control_msg failed %d\n", rv);
+		goto exit;
+	} else if (rv != 1) {
+		dev_warn(dev, "simple usb_control_msg returned %d\n", rv);
+		rv = -EIO;
+		goto exit;
+	}
+
+	if (buffer[0] != USBTMC_STATUS_SUCCESS) {
+		dev_err(dev, "simple control status returned %x\n", buffer[0]);
+		rv = -EIO;
+		goto exit;
+	}
+	rv = 0;
+
+ exit:
+	kfree(buffer);
+	return rv;
+}
+
 /*
  * Sends a REQUEST_DEV_DEP_MSG_IN message on the Bulk-IN endpoint.
  * @transfer_size: number of bytes to request from the device.
@@ -1183,6 +1238,21 @@ static long usbtmc_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case USBTMC488_IOCTL_READ_STB:
 		retval = usbtmc488_ioctl_read_stb(data, (void __user *)arg);
 		break;
+
+	case USBTMC488_IOCTL_REN_CONTROL:
+		retval = usbtmc488_ioctl_simple(data, (void __user *)arg,
+						USBTMC488_REQUEST_REN_CONTROL);
+		break;
+
+	case USBTMC488_IOCTL_GOTO_LOCAL:
+		retval = usbtmc488_ioctl_simple(data, (void __user *)arg,
+						USBTMC488_REQUEST_GOTO_LOCAL);
+		break;
+
+	case USBTMC488_IOCTL_LOCAL_LOCKOUT:
+		retval = usbtmc488_ioctl_simple(data, (void __user *)arg,
+						USBTMC488_REQUEST_LOCAL_LOCKOUT);
+		break;
 	}
 
 skip_io_on_zombie:
diff --git a/include/uapi/linux/usb/tmc.h b/include/uapi/linux/usb/tmc.h
index b512fb2d403e..2e59d9c50b8d 100644
--- a/include/uapi/linux/usb/tmc.h
+++ b/include/uapi/linux/usb/tmc.h
@@ -33,6 +33,9 @@
 #define USBTMC_REQUEST_GET_CAPABILITIES			7
 #define USBTMC_REQUEST_INDICATOR_PULSE			64
 #define USBTMC488_REQUEST_READ_STATUS_BYTE		128
+#define USBTMC488_REQUEST_REN_CONTROL			160
+#define USBTMC488_REQUEST_GOTO_LOCAL			161
+#define USBTMC488_REQUEST_LOCAL_LOCKOUT			162
 
 /* Request values for USBTMC driver's ioctl entry point */
 #define USBTMC_IOC_NR			91
@@ -44,6 +47,9 @@
 #define USBTMC_IOCTL_CLEAR_IN_HALT	_IO(USBTMC_IOC_NR, 7)
 #define USBTMC488_IOCTL_GET_CAPS	_IOR(USBTMC_IOC_NR, 17, unsigned char)
 #define USBTMC488_IOCTL_READ_STB	_IOR(USBTMC_IOC_NR, 18, unsigned char)
+#define USBTMC488_IOCTL_REN_CONTROL	_IOW(USBTMC_IOC_NR, 19, unsigned char)
+#define USBTMC488_IOCTL_GOTO_LOCAL	_IO(USBTMC_IOC_NR, 20)
+#define USBTMC488_IOCTL_LOCAL_LOCKOUT	_IO(USBTMC_IOC_NR, 21)
 
 /* Driver encoded usb488 capabilities */
 #define USBTMC488_CAPABILITY_TRIGGER         1
-- 
cgit v1.2.3


From 6e7333d315a768170a59ac771297ee0551bdddbf Mon Sep 17 00:00:00 2001
From: Jarod Wilson <jarod@redhat.com>
Date: Mon, 1 Feb 2016 18:51:05 -0500
Subject: net: add rx_nohandler stat counter

This adds an rx_nohandler stat counter, along with a sysfs statistics
node, and copies the counter out via netlink as well.

CC: "David S. Miller" <davem@davemloft.net>
CC: Eric Dumazet <edumazet@google.com>
CC: Jiri Pirko <jiri@mellanox.com>
CC: Daniel Borkmann <daniel@iogearbox.net>
CC: Tom Herbert <tom@herbertland.com>
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <gospo@cumulusnetworks.com>
CC: netdev@vger.kernel.org
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    | 3 +++
 include/uapi/linux/if_link.h | 4 ++++
 net/core/dev.c               | 6 +++++-
 net/core/net-sysfs.c         | 2 ++
 net/core/rtnetlink.c         | 2 ++
 5 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 289c2314d766..78a20cec2a0a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1397,6 +1397,8 @@ enum netdev_priv_flags {
  *			do not use this in drivers
  *	@tx_dropped:	Dropped packets by core network,
  *			do not use this in drivers
+ *	@rx_nohandler:	nohandler dropped packets by core network on
+ *			inactive devices, do not use this in drivers
  *
  *	@wireless_handlers:	List of functions to handle Wireless Extensions,
  *				instead of ioctl,
@@ -1611,6 +1613,7 @@ struct net_device {
 
 	atomic_long_t		rx_dropped;
 	atomic_long_t		tx_dropped;
+	atomic_long_t		rx_nohandler;
 
 #ifdef CONFIG_WIRELESS_EXT
 	const struct iw_handler_def *	wireless_handlers;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index a30b78090594..d3e90b91e07e 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -35,6 +35,8 @@ struct rtnl_link_stats {
 	/* for cslip etc */
 	__u32	rx_compressed;
 	__u32	tx_compressed;
+
+	__u32	rx_nohandler;		/* dropped, no handler found	*/
 };
 
 /* The main device statistics structure */
@@ -68,6 +70,8 @@ struct rtnl_link_stats64 {
 	/* for cslip etc */
 	__u64	rx_compressed;
 	__u64	tx_compressed;
+
+	__u64	rx_nohandler;		/* dropped, no handler found	*/
 };
 
 /* The struct should be in sync with struct ifmap */
diff --git a/net/core/dev.c b/net/core/dev.c
index 65863e512227..f1284835b8c9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4154,7 +4154,10 @@ ncls:
 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
 drop:
-		atomic_long_inc(&skb->dev->rx_dropped);
+		if (!deliver_exact)
+			atomic_long_inc(&skb->dev->rx_dropped);
+		else
+			atomic_long_inc(&skb->dev->rx_nohandler);
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
 		 * me how you were going to use this. :-)
@@ -7307,6 +7310,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 	}
 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
+	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
 	return storage;
 }
 EXPORT_SYMBOL(dev_get_stats);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index b6c8a6629b39..da7dbc237a5f 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -574,6 +574,7 @@ NETSTAT_ENTRY(tx_heartbeat_errors);
 NETSTAT_ENTRY(tx_window_errors);
 NETSTAT_ENTRY(rx_compressed);
 NETSTAT_ENTRY(tx_compressed);
+NETSTAT_ENTRY(rx_nohandler);
 
 static struct attribute *netstat_attrs[] = {
 	&dev_attr_rx_packets.attr,
@@ -599,6 +600,7 @@ static struct attribute *netstat_attrs[] = {
 	&dev_attr_tx_window_errors.attr,
 	&dev_attr_rx_compressed.attr,
 	&dev_attr_tx_compressed.attr,
+	&dev_attr_rx_nohandler.attr,
 	NULL
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d735e854f916..20d71358c143 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -804,6 +804,8 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 
 	a->rx_compressed = b->rx_compressed;
 	a->tx_compressed = b->tx_compressed;
+
+	a->rx_nohandler = b->rx_nohandler;
 }
 
 static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
-- 
cgit v1.2.3


From 824bd0ce6c7c43a9e1e210abf124958e54d88342 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 1 Feb 2016 22:39:53 -0800
Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_HASH map

Introduce BPF_MAP_TYPE_PERCPU_HASH map type which is used to do
accurate counters without need to use BPF_XADD instruction which turned
out to be too costly for high-performance network monitoring.
In the typical use case the 'key' is the flow tuple or other long
living object that sees a lot of events per second.

bpf_map_lookup_elem() returns per-cpu area.
Example:
struct {
  u32 packets;
  u32 bytes;
} * ptr = bpf_map_lookup_elem(&map, &key);
/* ptr points to this_cpu area of the value, so the following
 * increments will not collide with other cpus
 */
ptr->packets ++;
ptr->bytes += skb->len;

bpf_update_elem() atomically creates a new element where all per-cpu
values are zero initialized and this_cpu value is populated with
given 'value'.
Note that non-per-cpu hash map always allocates new element
and then deletes old after rcu grace period to maintain atomicity
of update. Per-cpu hash map updates element values in-place.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |   1 +
 kernel/bpf/hashtab.c     | 275 +++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 229 insertions(+), 47 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa6f8571de13..43ae40c8763e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_ARRAY,
 	BPF_MAP_TYPE_PROG_ARRAY,
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	BPF_MAP_TYPE_PERCPU_HASH,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index c5b30fd8a315..2be5f6e8bb04 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -31,21 +31,27 @@ struct bpf_htab {
 struct htab_elem {
 	struct hlist_node hash_node;
 	struct rcu_head rcu;
-	u32 hash;
+	union {
+		u32 hash;
+		u32 key_size;
+	};
 	char key[0] __aligned(8);
 };
 
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
+	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
 	struct bpf_htab *htab;
 	int err, i;
+	u64 cost;
 
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
 
 	/* mandatory map attributes */
+	htab->map.map_type = attr->map_type;
 	htab->map.key_size = attr->key_size;
 	htab->map.value_size = attr->value_size;
 	htab->map.max_entries = attr->max_entries;
@@ -77,24 +83,34 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		 */
 		goto free_htab;
 
+	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
+		/* make sure the size for pcpu_alloc() is reasonable */
+		goto free_htab;
+
 	htab->elem_size = sizeof(struct htab_elem) +
-			  round_up(htab->map.key_size, 8) +
-			  htab->map.value_size;
+			  round_up(htab->map.key_size, 8);
+	if (percpu)
+		htab->elem_size += sizeof(void *);
+	else
+		htab->elem_size += htab->map.value_size;
 
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
 	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
 		goto free_htab;
 
-	if ((u64) htab->n_buckets * sizeof(struct bucket) +
-	    (u64) htab->elem_size * htab->map.max_entries >=
-	    U32_MAX - PAGE_SIZE)
+	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
+	       (u64) htab->elem_size * htab->map.max_entries;
+
+	if (percpu)
+		cost += (u64) round_up(htab->map.value_size, 8) *
+			num_possible_cpus() * htab->map.max_entries;
+
+	if (cost >= U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
 		goto free_htab;
 
-	htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
-				   htab->elem_size * htab->map.max_entries,
-				   PAGE_SIZE) >> PAGE_SHIFT;
+	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
 	err = -ENOMEM;
 	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
@@ -148,7 +164,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
 }
 
 /* Called from syscall or from eBPF program */
-static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 	struct hlist_head *head;
@@ -166,6 +182,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
 
 	l = lookup_elem_raw(head, hash, key, key_size);
 
+	return l;
+}
+
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
 	if (l)
 		return l->key + round_up(map->key_size, 8);
 
@@ -230,65 +253,139 @@ find_first_elem:
 	return -ENOENT;
 }
 
+
+static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
+				     void __percpu *pptr)
+{
+	*(void __percpu **)(l->key + key_size) = pptr;
+}
+
+static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
+{
+	return *(void __percpu **)(l->key + key_size);
+}
+
+static void htab_percpu_elem_free(struct htab_elem *l)
+{
+	free_percpu(htab_elem_get_ptr(l, l->key_size));
+	kfree(l);
+}
+
+static void htab_percpu_elem_free_rcu(struct rcu_head *head)
+{
+	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
+
+	htab_percpu_elem_free(l);
+}
+
+static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
+{
+	if (percpu) {
+		l->key_size = key_size;
+		call_rcu(&l->rcu, htab_percpu_elem_free_rcu);
+	} else {
+		kfree_rcu(l, rcu);
+	}
+}
+
+static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
+					 void *value, u32 key_size, u32 hash,
+					 bool percpu)
+{
+	u32 size = htab->map.value_size;
+	struct htab_elem *l_new;
+	void __percpu *pptr;
+
+	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+	if (!l_new)
+		return NULL;
+
+	memcpy(l_new->key, key, key_size);
+	if (percpu) {
+		/* round up value_size to 8 bytes */
+		size = round_up(size, 8);
+
+		/* alloc_percpu zero-fills */
+		pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN);
+		if (!pptr) {
+			kfree(l_new);
+			return NULL;
+		}
+
+		/* copy true value_size bytes */
+		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
+		htab_elem_set_ptr(l_new, key_size, pptr);
+	} else {
+		memcpy(l_new->key + round_up(key_size, 8), value, size);
+	}
+
+	l_new->hash = hash;
+	return l_new;
+}
+
+static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
+		       u64 map_flags)
+{
+	if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries))
+		/* if elem with this 'key' doesn't exist and we've reached
+		 * max_entries limit, fail insertion of new elem
+		 */
+		return -E2BIG;
+
+	if (l_old && map_flags == BPF_NOEXIST)
+		/* elem already exists */
+		return -EEXIST;
+
+	if (!l_old && map_flags == BPF_EXIST)
+		/* elem doesn't exist, cannot update it */
+		return -ENOENT;
+
+	return 0;
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 				u64 map_flags)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct htab_elem *l_new, *l_old;
+	struct htab_elem *l_new = NULL, *l_old;
 	struct hlist_head *head;
-	struct bucket *b;
 	unsigned long flags;
-	u32 key_size;
+	struct bucket *b;
+	u32 key_size, hash;
 	int ret;
 
-	if (map_flags > BPF_EXIST)
+	if (unlikely(map_flags > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	/* allocate new element outside of lock */
-	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-	if (!l_new)
-		return -ENOMEM;
-
 	key_size = map->key_size;
 
-	memcpy(l_new->key, key, key_size);
-	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+	hash = htab_map_hash(key, key_size);
+
+	/* allocate new element outside of the lock, since
+	 * we're most likley going to insert it
+	 */
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false);
+	if (!l_new)
+		return -ENOMEM;
 
-	l_new->hash = htab_map_hash(l_new->key, key_size);
-	b = __select_bucket(htab, l_new->hash);
+	b = __select_bucket(htab, hash);
 	head = &b->head;
 
 	/* bpf_map_update_elem() can be called in_irq() */
 	raw_spin_lock_irqsave(&b->lock, flags);
 
-	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+	l_old = lookup_elem_raw(head, hash, key, key_size);
 
-	if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
-		/* if elem with this 'key' doesn't exist and we've reached
-		 * max_entries limit, fail insertion of new elem
-		 */
-		ret = -E2BIG;
+	ret = check_flags(htab, l_old, map_flags);
+	if (ret)
 		goto err;
-	}
 
-	if (l_old && map_flags == BPF_NOEXIST) {
-		/* elem already exists */
-		ret = -EEXIST;
-		goto err;
-	}
-
-	if (!l_old && map_flags == BPF_EXIST) {
-		/* elem doesn't exist, cannot update it */
-		ret = -ENOENT;
-		goto err;
-	}
-
-	/* add new element to the head of the list, so that concurrent
-	 * search will find it before old elem
+	/* add new element to the head of the list, so that
+	 * concurrent search will find it before old elem
 	 */
 	hlist_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
@@ -298,7 +395,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		atomic_inc(&htab->count);
 	}
 	raw_spin_unlock_irqrestore(&b->lock, flags);
-
 	return 0;
 err:
 	raw_spin_unlock_irqrestore(&b->lock, flags);
@@ -306,10 +402,64 @@ err:
 	return ret;
 }
 
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+				       void *value, u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l_new = NULL, *l_old;
+	struct hlist_head *head;
+	unsigned long flags;
+	struct bucket *b;
+	u32 key_size, hash;
+	int ret;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	b = __select_bucket(htab, hash);
+	head = &b->head;
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	raw_spin_lock_irqsave(&b->lock, flags);
+
+	l_old = lookup_elem_raw(head, hash, key, key_size);
+
+	ret = check_flags(htab, l_old, map_flags);
+	if (ret)
+		goto err;
+
+	if (l_old) {
+		/* per-cpu hash map can update value in-place */
+		memcpy(this_cpu_ptr(htab_elem_get_ptr(l_old, key_size)),
+		       value, htab->map.value_size);
+	} else {
+		l_new = alloc_htab_elem(htab, key, value, key_size,
+					hash, true);
+		if (!l_new) {
+			ret = -ENOMEM;
+			goto err;
+		}
+		hlist_add_head_rcu(&l_new->hash_node, head);
+		atomic_inc(&htab->count);
+	}
+	ret = 0;
+err:
+	raw_spin_unlock_irqrestore(&b->lock, flags);
+	return ret;
+}
+
 /* Called from syscall or from eBPF program */
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
 	struct hlist_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
@@ -332,7 +482,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 	if (l) {
 		hlist_del_rcu(&l->hash_node);
 		atomic_dec(&htab->count);
-		kfree_rcu(l, rcu);
+		free_htab_elem(l, percpu, key_size);
 		ret = 0;
 	}
 
@@ -352,7 +502,12 @@ static void delete_all_elements(struct bpf_htab *htab)
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_del_rcu(&l->hash_node);
 			atomic_dec(&htab->count);
-			kfree(l);
+			if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
+				l->key_size = htab->map.key_size;
+				htab_percpu_elem_free(l);
+			} else {
+				kfree(l);
+			}
 		}
 	}
 }
@@ -391,9 +546,35 @@ static struct bpf_map_type_list htab_type __read_mostly = {
 	.type = BPF_MAP_TYPE_HASH,
 };
 
+/* Called from eBPF program */
+static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct htab_elem *l = __htab_map_lookup_elem(map, key);
+
+	if (l)
+		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
+	else
+		return NULL;
+}
+
+static const struct bpf_map_ops htab_percpu_ops = {
+	.map_alloc = htab_map_alloc,
+	.map_free = htab_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_percpu_map_lookup_elem,
+	.map_update_elem = htab_percpu_map_update_elem,
+	.map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+	.ops = &htab_percpu_ops,
+	.type = BPF_MAP_TYPE_PERCPU_HASH,
+};
+
 static int __init register_htab_map(void)
 {
 	bpf_register_map_type(&htab_type);
+	bpf_register_map_type(&htab_percpu_type);
 	return 0;
 }
 late_initcall(register_htab_map);
-- 
cgit v1.2.3


From a10423b87a7eae75da79ce80a8d9475047a674ee Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 1 Feb 2016 22:39:54 -0800
Subject: bpf: introduce BPF_MAP_TYPE_PERCPU_ARRAY map

Primary use case is a histogram array of latency
where bpf program computes the latency of block requests or other
events and stores histogram of latency into array of 64 elements.
All cpus are constantly running, so normal increment is not accurate,
bpf_xadd causes cache ping-pong and this per-cpu approach allows
fastest collision-free counters.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   1 +
 include/uapi/linux/bpf.h |   1 +
 kernel/bpf/arraymap.c    | 102 ++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 93 insertions(+), 11 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 83d1926c61e4..141fb0d45731 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -151,6 +151,7 @@ struct bpf_array {
 	union {
 		char value[0] __aligned(8);
 		void *ptrs[0] __aligned(8);
+		void __percpu *pptrs[0] __aligned(8);
 	};
 };
 #define MAX_TAIL_CALL_CNT 32
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 43ae40c8763e..2ee0fde1bf96 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -82,6 +82,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PROG_ARRAY,
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 	BPF_MAP_TYPE_PERCPU_HASH,
+	BPF_MAP_TYPE_PERCPU_ARRAY,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 89ebbc4d1164..b9bf1d7949ca 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -17,11 +17,39 @@
 #include <linux/filter.h>
 #include <linux/perf_event.h>
 
+static void bpf_array_free_percpu(struct bpf_array *array)
+{
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++)
+		free_percpu(array->pptrs[i]);
+}
+
+static int bpf_array_alloc_percpu(struct bpf_array *array)
+{
+	void __percpu *ptr;
+	int i;
+
+	for (i = 0; i < array->map.max_entries; i++) {
+		ptr = __alloc_percpu_gfp(array->elem_size, 8,
+					 GFP_USER | __GFP_NOWARN);
+		if (!ptr) {
+			bpf_array_free_percpu(array);
+			return -ENOMEM;
+		}
+		array->pptrs[i] = ptr;
+	}
+
+	return 0;
+}
+
 /* Called from syscall */
 static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 {
+	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	struct bpf_array *array;
-	u32 elem_size, array_size;
+	u64 array_size;
+	u32 elem_size;
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -36,12 +64,16 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	elem_size = round_up(attr->value_size, 8);
 
-	/* check round_up into zero and u32 overflow */
-	if (elem_size == 0 ||
-	    attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size)
+	array_size = sizeof(*array);
+	if (percpu)
+		array_size += (u64) attr->max_entries * sizeof(void *);
+	else
+		array_size += (u64) attr->max_entries * elem_size;
+
+	/* make sure there is no u32 overflow later in round_up() */
+	if (array_size >= U32_MAX - PAGE_SIZE)
 		return ERR_PTR(-ENOMEM);
 
-	array_size = sizeof(*array) + attr->max_entries * elem_size;
 
 	/* allocate all map elements and zero-initialize them */
 	array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
@@ -52,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	}
 
 	/* copy mandatory map attributes */
+	array->map.map_type = attr->map_type;
 	array->map.key_size = attr->key_size;
 	array->map.value_size = attr->value_size;
 	array->map.max_entries = attr->max_entries;
-	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
 	array->elem_size = elem_size;
 
+	if (!percpu)
+		goto out;
+
+	array_size += (u64) attr->max_entries * elem_size * num_possible_cpus();
+
+	if (array_size >= U32_MAX - PAGE_SIZE ||
+	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
+		kvfree(array);
+		return ERR_PTR(-ENOMEM);
+	}
+out:
+	array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
+
 	return &array->map;
 }
 
@@ -67,12 +112,24 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
 
-	if (index >= array->map.max_entries)
+	if (unlikely(index >= array->map.max_entries))
 		return NULL;
 
 	return array->value + array->elem_size * index;
 }
 
+/* Called from eBPF program */
+static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (unlikely(index >= array->map.max_entries))
+		return NULL;
+
+	return this_cpu_ptr(array->pptrs[index]);
+}
+
 /* Called from syscall */
 static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 {
@@ -99,19 +156,24 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	u32 index = *(u32 *)key;
 
-	if (map_flags > BPF_EXIST)
+	if (unlikely(map_flags > BPF_EXIST))
 		/* unknown flags */
 		return -EINVAL;
 
-	if (index >= array->map.max_entries)
+	if (unlikely(index >= array->map.max_entries))
 		/* all elements were pre-allocated, cannot insert a new one */
 		return -E2BIG;
 
-	if (map_flags == BPF_NOEXIST)
+	if (unlikely(map_flags == BPF_NOEXIST))
 		/* all elements already exist */
 		return -EEXIST;
 
-	memcpy(array->value + array->elem_size * index, value, map->value_size);
+	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+		memcpy(this_cpu_ptr(array->pptrs[index]),
+		       value, map->value_size);
+	else
+		memcpy(array->value + array->elem_size * index,
+		       value, map->value_size);
 	return 0;
 }
 
@@ -133,6 +195,9 @@ static void array_map_free(struct bpf_map *map)
 	 */
 	synchronize_rcu();
 
+	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+		bpf_array_free_percpu(array);
+
 	kvfree(array);
 }
 
@@ -150,9 +215,24 @@ static struct bpf_map_type_list array_type __read_mostly = {
 	.type = BPF_MAP_TYPE_ARRAY,
 };
 
+static const struct bpf_map_ops percpu_array_ops = {
+	.map_alloc = array_map_alloc,
+	.map_free = array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = percpu_array_map_lookup_elem,
+	.map_update_elem = array_map_update_elem,
+	.map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list percpu_array_type __read_mostly = {
+	.ops = &percpu_array_ops,
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+};
+
 static int __init register_array_map(void)
 {
 	bpf_register_map_type(&array_type);
+	bpf_register_map_type(&percpu_array_type);
 	return 0;
 }
 late_initcall(register_array_map);
-- 
cgit v1.2.3


From 67eb03318bc5fe170ae832423fda7a23b0d801cf Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 2 Feb 2016 07:43:45 -0800
Subject: net: Add support for fill_slave_info to VRF device

Allows userspace to have direct access to VRF table association
versus looking up master device and its table.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vrf.c            | 21 +++++++++++++++++++++
 include/uapi/linux/if_link.h |  8 ++++++++
 2 files changed, 29 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 66addb7a7911..76e1fc9d8748 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -877,6 +877,24 @@ static int vrf_fillinfo(struct sk_buff *skb,
 	return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
 }
 
+static size_t vrf_get_slave_size(const struct net_device *bond_dev,
+				 const struct net_device *slave_dev)
+{
+	return nla_total_size(sizeof(u32));  /* IFLA_VRF_PORT_TABLE */
+}
+
+static int vrf_fill_slave_info(struct sk_buff *skb,
+			       const struct net_device *vrf_dev,
+			       const struct net_device *slave_dev)
+{
+	struct net_vrf *vrf = netdev_priv(vrf_dev);
+
+	if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
 static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
 	[IFLA_VRF_TABLE] = { .type = NLA_U32 },
 };
@@ -890,6 +908,9 @@ static struct rtnl_link_ops vrf_link_ops __read_mostly = {
 	.validate	= vrf_validate,
 	.fill_info	= vrf_fillinfo,
 
+	.get_slave_size  = vrf_get_slave_size,
+	.fill_slave_info = vrf_fill_slave_info,
+
 	.newlink	= vrf_newlink,
 	.dellink	= vrf_dellink,
 	.setup		= vrf_setup,
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d3e90b91e07e..d452cea59020 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -405,6 +405,14 @@ enum {
 
 #define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1)
 
+enum {
+	IFLA_VRF_PORT_UNSPEC,
+	IFLA_VRF_PORT_TABLE,
+	__IFLA_VRF_PORT_MAX
+};
+
+#define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1)
+
 /* IPVLAN section */
 enum {
 	IFLA_IPVLAN_UNSPEC,
-- 
cgit v1.2.3


From 103a8ad1fa3b261c78dfc842cb315defe9d40be0 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Wed, 3 Feb 2016 04:04:36 +0100
Subject: ethtool: add speed/duplex validation functions

Add functions which check if the speed/duplex are defined.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 57fa39005e79..b2e180181629 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1319,11 +1319,45 @@ enum ethtool_sfeatures_retval_bits {
 
 #define SPEED_UNKNOWN		-1
 
+static inline int ethtool_validate_speed(__u32 speed)
+{
+	switch (speed) {
+	case SPEED_10:
+	case SPEED_100:
+	case SPEED_1000:
+	case SPEED_2500:
+	case SPEED_5000:
+	case SPEED_10000:
+	case SPEED_20000:
+	case SPEED_25000:
+	case SPEED_40000:
+	case SPEED_50000:
+	case SPEED_56000:
+	case SPEED_100000:
+	case SPEED_UNKNOWN:
+		return 1;
+	}
+
+	return 0;
+}
+
 /* Duplex, half or full. */
 #define DUPLEX_HALF		0x00
 #define DUPLEX_FULL		0x01
 #define DUPLEX_UNKNOWN		0xff
 
+static inline int ethtool_validate_duplex(__u8 duplex)
+{
+	switch (duplex) {
+	case DUPLEX_HALF:
+	case DUPLEX_FULL:
+	case DUPLEX_UNKNOWN:
+		return 1;
+	}
+
+	return 0;
+}
+
 /* Which connector port. */
 #define PORT_TP			0x00
 #define PORT_AUI		0x01
-- 
cgit v1.2.3


From 8b37524962b9c54423374717786198f5c0820a28 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 8 Feb 2016 11:21:50 +1100
Subject: quota: add new quotactl Q_XGETNEXTQUOTA

Q_XGETNEXTQUOTA is exactly like Q_XGETQUOTA, except that it
will return quota information for the id equal to or greater
than the id requested.  In other words, if the requested id has
no quota, the command will return quota information for the
next higher id which does have a quota set.  If no higher id
has an active quota, -ESRCH is returned.

This allows filesystems to do efficient iteration in kernelspace,
much like extN filesystems do in userspace when asked to report
all active quotas.

The patch adds a d_id field to struct qc_dqblk so that we can
pass back the id of the quota which was found, and return it
to userspace.

Today, filesystems such as XFS require getpwent-style iterations,
and for systems which have i.e. LDAP backends, this can be very
slow, or even impossible if iteration is not allowed in the
configuration.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/quota/quota.c               | 31 +++++++++++++++++++++++++++++++
 include/linux/quota.h          |  2 ++
 include/uapi/linux/dqblk_xfs.h |  1 +
 3 files changed, 34 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ea6667083b3e..0a6dd71ea309 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -625,6 +625,34 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
 	return ret;
 }
 
+/*
+ * Return quota for next active quota >= this id, if any exists,
+ * otherwise return -ESRCH via ->get_nextdqblk.
+ */
+static int quota_getnextxquota(struct super_block *sb, int type, qid_t id,
+			    void __user *addr)
+{
+	struct fs_disk_quota fdq;
+	struct qc_dqblk qdq;
+	struct kqid qid;
+	qid_t id_out;
+	int ret;
+
+	if (!sb->s_qcop->get_nextdqblk)
+		return -ENOSYS;
+	qid = make_kqid(current_user_ns(), type, id);
+	if (!qid_valid(qid))
+		return -EINVAL;
+	ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq);
+	if (ret)
+		return ret;
+	id_out = from_kqid(current_user_ns(), qid);
+	copy_to_xfs_dqblk(&fdq, &qdq, type, id_out);
+	if (copy_to_user(addr, &fdq, sizeof(fdq)))
+		return -EFAULT;
+	return ret;
+}
+
 static int quota_rmxquota(struct super_block *sb, void __user *addr)
 {
 	__u32 flags;
@@ -690,6 +718,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 		return quota_setxquota(sb, type, id, addr);
 	case Q_XGETQUOTA:
 		return quota_getxquota(sb, type, id, addr);
+	case Q_XGETNEXTQUOTA:
+		return quota_getnextxquota(sb, type, id, addr);
 	case Q_XQUOTASYNC:
 		if (sb->s_flags & MS_RDONLY)
 			return -EROFS;
@@ -712,6 +742,7 @@ static int quotactl_cmd_write(int cmd)
 	case Q_XGETQSTAT:
 	case Q_XGETQSTATV:
 	case Q_XGETQUOTA:
+	case Q_XGETNEXTQUOTA:
 	case Q_XQUOTASYNC:
 		return 0;
 	}
diff --git a/include/linux/quota.h b/include/linux/quota.h
index b2505acfd3c0..fba92f5c1a63 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -425,6 +425,8 @@ struct quotactl_ops {
 	int (*quota_sync)(struct super_block *, int);
 	int (*set_info)(struct super_block *, int, struct qc_info *);
 	int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *);
+	int (*get_nextdqblk)(struct super_block *, struct kqid *,
+			     struct qc_dqblk *);
 	int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *);
 	int (*get_state)(struct super_block *, struct qc_state *);
 	int (*rm_xquota)(struct super_block *, unsigned int);
diff --git a/include/uapi/linux/dqblk_xfs.h b/include/uapi/linux/dqblk_xfs.h
index dcd75cc26196..11b3b31faf14 100644
--- a/include/uapi/linux/dqblk_xfs.h
+++ b/include/uapi/linux/dqblk_xfs.h
@@ -39,6 +39,7 @@
 #define Q_XQUOTARM	XQM_CMD(6)	/* free disk space used by dquots */
 #define Q_XQUOTASYNC	XQM_CMD(7)	/* delalloc flush, updates dquots */
 #define Q_XGETQSTATV	XQM_CMD(8)	/* newer version of get quota */
+#define Q_XGETNEXTQUOTA	XQM_CMD(9)	/* get disk limits and usage >= ID */
 
 /*
  * fs_disk_quota structure:
-- 
cgit v1.2.3


From 926132c0257a5a8d149a6a395cc3405e55420566 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Mon, 8 Feb 2016 11:22:21 +1100
Subject: quota: add new quotactl Q_GETNEXTQUOTA

Q_GETNEXTQUOTA is exactly like Q_GETQUOTA, except that it
will return quota information for the id equal to or greater
than the id requested.  In other words, if the requested id has
no quota, the command will return quota information for the
next higher id which does have a quota set.  If no higher id
has an active quota, -ESRCH is returned.

This allows filesystems to do efficient iteration in kernelspace,
much like extN filesystems do in userspace when asked to report
all active quotas.

This does require a new data structure for userspace, as the
current structure does not include an ID for the returned quota
information.

Today, Ext4 with a hidden quota inode requires getpwent-style
iterations, and for systems which have i.e. LDAP backends,
this can be very slow, or even impossible if iteration is not
allowed in the configuration.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
---
 fs/quota/quota.c           | 31 +++++++++++++++++++++++++++++++
 include/uapi/linux/quota.h | 14 ++++++++++++++
 2 files changed, 45 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 0a6dd71ea309..0ebc90496525 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -222,6 +222,34 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
 	return 0;
 }
 
+/*
+ * Return quota for next active quota >= this id, if any exists,
+ * otherwise return -ESRCH via ->get_nextdqblk
+ */
+static int quota_getnextquota(struct super_block *sb, int type, qid_t id,
+			  void __user *addr)
+{
+	struct kqid qid;
+	struct qc_dqblk fdq;
+	struct if_nextdqblk idq;
+	int ret;
+
+	if (!sb->s_qcop->get_nextdqblk)
+		return -ENOSYS;
+	qid = make_kqid(current_user_ns(), type, id);
+	if (!qid_valid(qid))
+		return -EINVAL;
+	ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq);
+	if (ret)
+		return ret;
+	/* struct if_nextdqblk is a superset of struct if_dqblk */
+	copy_to_if_dqblk((struct if_dqblk *)&idq, &fdq);
+	idq.dqb_id = from_kqid(current_user_ns(), qid);
+	if (copy_to_user(addr, &idq, sizeof(idq)))
+		return -EFAULT;
+	return 0;
+}
+
 static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src)
 {
 	dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit);
@@ -698,6 +726,8 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 		return quota_setinfo(sb, type, addr);
 	case Q_GETQUOTA:
 		return quota_getquota(sb, type, id, addr);
+	case Q_GETNEXTQUOTA:
+		return quota_getnextquota(sb, type, id, addr);
 	case Q_SETQUOTA:
 		return quota_setquota(sb, type, id, addr);
 	case Q_SYNC:
@@ -738,6 +768,7 @@ static int quotactl_cmd_write(int cmd)
 	switch (cmd) {
 	case Q_GETFMT:
 	case Q_GETINFO:
+	case Q_GETNEXTQUOTA:
 	case Q_SYNC:
 	case Q_XGETQSTAT:
 	case Q_XGETQSTATV:
diff --git a/include/uapi/linux/quota.h b/include/uapi/linux/quota.h
index 9c95b2c1c88a..38baddb807f5 100644
--- a/include/uapi/linux/quota.h
+++ b/include/uapi/linux/quota.h
@@ -71,6 +71,7 @@
 #define Q_SETINFO  0x800006	/* set information about quota files */
 #define Q_GETQUOTA 0x800007	/* get user quota structure */
 #define Q_SETQUOTA 0x800008	/* set user quota structure */
+#define Q_GETNEXTQUOTA 0x800009	/* get disk limits and usage >= ID */
 
 /* Quota format type IDs */
 #define	QFMT_VFS_OLD 1
@@ -119,6 +120,19 @@ struct if_dqblk {
 	__u32 dqb_valid;
 };
 
+struct if_nextdqblk {
+	__u64 dqb_bhardlimit;
+	__u64 dqb_bsoftlimit;
+	__u64 dqb_curspace;
+	__u64 dqb_ihardlimit;
+	__u64 dqb_isoftlimit;
+	__u64 dqb_curinodes;
+	__u64 dqb_btime;
+	__u64 dqb_itime;
+	__u32 dqb_valid;
+	__u32 dqb_id;
+};
+
 /*
  * Structure used for setting quota information about file via quotactl
  * Following flags are used to specify which fields are valid
-- 
cgit v1.2.3


From 157ede6784ba2837c7dc43f195418c75927f8488 Mon Sep 17 00:00:00 2001
From: Elad Raz <eladr@mellanox.com>
Date: Wed, 3 Feb 2016 09:57:04 +0100
Subject: bridge: mdb: add support for offloaded mdb entries

Add new bitmask member 'flags' to br_mdb_entry structure. Adding
MDB_FLAGS_OFFLOAD bit which indicates MDB entries is offloaded to hardware.

Signed-off-by: Elad Raz <eladr@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 18db14477bdd..ec3547234998 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -183,6 +183,8 @@ struct br_mdb_entry {
 #define MDB_TEMPORARY 0
 #define MDB_PERMANENT 1
 	__u8 state;
+#define MDB_FLAGS_OFFLOAD	(1 << 0)
+	__u8 flags;
 	__u16 vid;
 	struct {
 		union {
-- 
cgit v1.2.3


From 3c702e9987e261042a07e43460a8148be254412e Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Wed, 21 Oct 2015 15:29:53 +0200
Subject: gpio: add a userspace chardev ABI for GPIOs

A new chardev that is to be used for userspace GPIO access is
added in this patch. It is intended to gradually replace the
horribly broken sysfs ABI.

Using a chardev has many upsides:

- All operations are per-gpiochip, which is the actual
  device underlying the GPIOs, making us tie in to the
  kernel device model properly.

- Hotpluggable GPIO controllers can come and go, as this
  kind of problem has been know to userspace for character
  devices since ages, and if a gpiochip handle is held in
  userspace we know we will break something, whereas the
  sysfs is stateless.

- The one-value-per-file rule of sysfs is really hard to
  maintain when you want to twist more than one knob at a time,
  for example have in-kernel APIs to switch several GPIO
  lines at the same time, and this will be possible to do
  with a single ioctl() from userspace, saving a lot of
  context switching.

We also need to add a new bus type for GPIO. This is
necessary for example for userspace coldplug, where sysfs is
traversed to find the boot-time device nodes and create the
character devices in /dev.

This new chardev ABI is *non* *optional* and can be counted
on to be present in the future, emphasizing the preference
of this ABI.

The ABI only implements one single ioctl() to get the name
and number of GPIO lines of a chip. Even this is debatable:
see it as a minimal example for review. This ABI shall be
ruthlessly reviewed and etched in stone.

The old /sys/class/gpio is still optional to compile in,
but will be deprecated.

Unique device IDs are created using IDR, which is overkill
and insanely scalable, but also well tested.

Cc: Johan Hovold <johan@kernel.org>
Cc: Michael Welling <mwelling@ieee.org>
Cc: Markus Pargmann <mpa@pengutronix.de>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 MAINTAINERS               |   1 +
 drivers/gpio/gpiolib.c    | 125 +++++++++++++++++++++++++++++++++++++++++++++-
 drivers/gpio/gpiolib.h    |   2 +
 include/uapi/linux/Kbuild |   1 +
 include/uapi/linux/gpio.h |  28 +++++++++++
 5 files changed, 156 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/gpio.h

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 30aca4aa5467..76986c3ab4ff 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4819,6 +4819,7 @@ F:	drivers/gpio/
 F:	include/linux/gpio/
 F:	include/linux/gpio.h
 F:	include/asm-generic/gpio.h
+F:	include/uapi/linux/gpio.h
 
 GRE DEMULTIPLEXER DRIVER
 M:	Dmitry Kozlov <xeb@mail.ru>
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 4b94e31a50af..70e0fff0a8a7 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -17,6 +17,10 @@
 #include <linux/gpio/machine.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/idr.h>
+#include <linux/cdev.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/gpio.h>
 
 #include "gpiolib.h"
 
@@ -45,6 +49,11 @@
 
 /* Device and char device-related information */
 static DEFINE_IDA(gpio_ida);
+static dev_t gpio_devt;
+#define GPIO_DEV_MAX 256 /* 256 GPIO chip devices supported */
+static struct bus_type gpio_bus_type = {
+	.name = "gpio",
+};
 
 /* gpio_lock prevents conflicts during gpio_desc[] table updates.
  * While any GPIO is requested, its gpio_chip is not removable;
@@ -316,10 +325,84 @@ static int gpiochip_set_desc_names(struct gpio_chip *gc)
 	return 0;
 }
 
+/**
+ * gpio_ioctl() - ioctl handler for the GPIO chardev
+ */
+static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+	struct gpio_device *gdev = filp->private_data;
+	struct gpio_chip *chip = gdev->chip;
+	int __user *ip = (int __user *)arg;
+	struct gpiochip_info chipinfo;
+
+	/* We fail any subsequent ioctl():s when the chip is gone */
+	if (!chip)
+		return -ENODEV;
+
+	if (cmd == GPIO_GET_CHIPINFO_IOCTL) {
+		/* Fill in the struct and pass to userspace */
+		strncpy(chipinfo.name, dev_name(&gdev->dev),
+			sizeof(chipinfo.name));
+		chipinfo.name[sizeof(chipinfo.name)-1] = '\0';
+		chipinfo.lines = chip->ngpio;
+		if (copy_to_user(ip, &chipinfo, sizeof(chipinfo)))
+			return -EFAULT;
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/**
+ * gpio_chrdev_open() - open the chardev for ioctl operations
+ * @inode: inode for this chardev
+ * @filp: file struct for storing private data
+ * Returns 0 on success
+ */
+static int gpio_chrdev_open(struct inode *inode, struct file *filp)
+{
+	struct gpio_device *gdev = container_of(inode->i_cdev,
+					      struct gpio_device, chrdev);
+
+	/* Fail on open if the backing gpiochip is gone */
+	if (!gdev || !gdev->chip)
+		return -ENODEV;
+	get_device(&gdev->dev);
+	filp->private_data = gdev;
+	return 0;
+}
+
+/**
+ * gpio_chrdev_release() - close chardev after ioctl operations
+ * @inode: inode for this chardev
+ * @filp: file struct for storing private data
+ * Returns 0 on success
+ */
+static int gpio_chrdev_release(struct inode *inode, struct file *filp)
+{
+	struct gpio_device *gdev = container_of(inode->i_cdev,
+					      struct gpio_device, chrdev);
+
+	if (!gdev)
+		return -ENODEV;
+	put_device(&gdev->dev);
+	return 0;
+}
+
+
+static const struct file_operations gpio_fileops = {
+	.release = gpio_chrdev_release,
+	.open = gpio_chrdev_open,
+	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
+	.unlocked_ioctl = gpio_ioctl,
+	.compat_ioctl = gpio_ioctl,
+};
+
 static void gpiodevice_release(struct device *dev)
 {
 	struct gpio_device *gdev = dev_get_drvdata(dev);
 
+	cdev_del(&gdev->chrdev);
 	list_del(&gdev->list);
 	ida_simple_remove(&gpio_ida, gdev->id);
 }
@@ -357,6 +440,7 @@ int gpiochip_add_data(struct gpio_chip *chip, void *data)
 	gdev = kmalloc(sizeof(*gdev), GFP_KERNEL);
 	if (!gdev)
 		return -ENOMEM;
+	gdev->dev.bus = &gpio_bus_type;
 	gdev->chip = chip;
 	chip->gpiodev = gdev;
 	if (chip->parent) {
@@ -452,9 +536,26 @@ int gpiochip_add_data(struct gpio_chip *chip, void *data)
 
 	acpi_gpiochip_add(chip);
 
+	/*
+	 * By first adding the chardev, and then adding the device,
+	 * we get a device node entry in sysfs under
+	 * /sys/bus/gpio/devices/gpiochipN/dev that can be used for
+	 * coldplug of device nodes and other udev business.
+	 */
+	cdev_init(&gdev->chrdev, &gpio_fileops);
+	gdev->chrdev.owner = THIS_MODULE;
+	gdev->chrdev.kobj.parent = &gdev->dev.kobj;
+	gdev->dev.devt = MKDEV(MAJOR(gpio_devt), gdev->id);
+	status = cdev_add(&gdev->chrdev, gdev->dev.devt, 1);
+	if (status < 0)
+		chip_warn(chip, "failed to add char device %d:%d\n",
+			  MAJOR(gpio_devt), gdev->id);
+	else
+		chip_dbg(chip, "added GPIO chardev (%d:%d)\n",
+			 MAJOR(gpio_devt), gdev->id);
 	status = device_add(&gdev->dev);
 	if (status)
-		goto err_remove_chip;
+		goto err_remove_chardev;
 
 	status = gpiochip_sysfs_register(chip);
 	if (status)
@@ -471,6 +572,8 @@ int gpiochip_add_data(struct gpio_chip *chip, void *data)
 
 err_remove_device:
 	device_del(&gdev->dev);
+err_remove_chardev:
+	cdev_del(&gdev->chrdev);
 err_remove_chip:
 	acpi_gpiochip_remove(chip);
 	gpiochip_free_hogs(chip);
@@ -2543,6 +2646,26 @@ void gpiod_put_array(struct gpio_descs *descs)
 }
 EXPORT_SYMBOL_GPL(gpiod_put_array);
 
+static int __init gpiolib_dev_init(void)
+{
+	int ret;
+
+	/* Register GPIO sysfs bus */
+	ret  = bus_register(&gpio_bus_type);
+	if (ret < 0) {
+		pr_err("gpiolib: could not register GPIO bus type\n");
+		return ret;
+	}
+
+	ret = alloc_chrdev_region(&gpio_devt, 0, GPIO_DEV_MAX, "gpiochip");
+	if (ret < 0) {
+		pr_err("gpiolib: failed to allocate char dev region\n");
+		bus_unregister(&gpio_bus_type);
+	}
+	return ret;
+}
+core_initcall(gpiolib_dev_init);
+
 #ifdef CONFIG_DEBUG_FS
 
 static void gpiolib_dbg_show(struct seq_file *s, struct gpio_chip *chip)
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index 3f329c922f5b..1524ba0ca99d 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -26,6 +26,7 @@ struct acpi_device;
  * struct gpio_device - internal state container for GPIO devices
  * @id: numerical ID number for the GPIO chip
  * @dev: the GPIO device struct
+ * @chrdev: character device for the GPIO device
  * @owner: helps prevent removal of modules exporting active GPIOs
  * @chip: pointer to the corresponding gpiochip, holding static
  * data for this device
@@ -39,6 +40,7 @@ struct acpi_device;
 struct gpio_device {
 	int			id;
 	struct device		dev;
+	struct cdev		chrdev;
 	struct module		*owner;
 	struct gpio_chip	*chip;
 	struct list_head        list;
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index ebd10e624598..5c9ae6a9b7f5 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -138,6 +138,7 @@ header-y += genetlink.h
 header-y += gen_stats.h
 header-y += gfs2_ondisk.h
 header-y += gigaset_dev.h
+header-y += gpio.h
 header-y += gsmmux.h
 header-y += hdlcdrv.h
 header-y += hdlc.h
diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h
new file mode 100644
index 000000000000..3188a87bdaa0
--- /dev/null
+++ b/include/uapi/linux/gpio.h
@@ -0,0 +1,28 @@
+/*
+ * <linux/gpio.h> - userspace ABI for the GPIO character devices
+ *
+ * Copyright (C) 2015 Linus Walleij
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+#ifndef _UAPI_GPIO_H_
+#define _UAPI_GPIO_H_
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+/**
+ * struct gpiochip_info - Information about a certain GPIO chip
+ * @name: the name of this GPIO chip
+ * @lines: number of GPIO lines on this chip
+ */
+struct gpiochip_info {
+	char name[32];
+	__u32 lines;
+};
+
+#define GPIO_GET_CHIPINFO_IOCTL _IOR('o', 0x01, struct gpiochip_info)
+
+#endif /* _UAPI_GPIO_H_ */
-- 
cgit v1.2.3


From 45cc29afb47f229014257068a3153276b2fb4c38 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hans.verkuil@cisco.com>
Date: Wed, 27 Jan 2016 11:31:39 -0200
Subject: [media] v4l2-ctrls: add V4L2_CID_DV_RX/TX_IT_CONTENT_TYPE controls

HDMI and DisplayPort both support IT Content Type information that
tells the receiver what type of material the video is, graphics such
as from a PC desktop, Photo, Cinema or Game (low-latency).

This patch adds controls for receivers and transmitters to get/set
this information.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 drivers/media/v4l2-core/v4l2-ctrls.c | 16 ++++++++++++++++
 include/uapi/linux/v4l2-controls.h   | 10 ++++++++++
 2 files changed, 26 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index c9d5537b6af7..bed04b1fdd0a 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -462,6 +462,14 @@ const char * const *v4l2_ctrl_get_menu(u32 id)
 		"RGB full range (0-255)",
 		NULL,
 	};
+	static const char * const dv_it_content_type[] = {
+		"Graphics",
+		"Photo",
+		"Cinema",
+		"Game",
+		"No IT Content",
+		NULL,
+	};
 	static const char * const detect_md_mode[] = {
 		"Disabled",
 		"Global",
@@ -560,6 +568,9 @@ const char * const *v4l2_ctrl_get_menu(u32 id)
 	case V4L2_CID_DV_TX_RGB_RANGE:
 	case V4L2_CID_DV_RX_RGB_RANGE:
 		return dv_rgb_range;
+	case V4L2_CID_DV_TX_IT_CONTENT_TYPE:
+	case V4L2_CID_DV_RX_IT_CONTENT_TYPE:
+		return dv_it_content_type;
 	case V4L2_CID_DETECT_MD_MODE:
 		return detect_md_mode;
 
@@ -881,8 +892,10 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_DV_TX_EDID_PRESENT:	return "EDID Present";
 	case V4L2_CID_DV_TX_MODE:		return "Transmit Mode";
 	case V4L2_CID_DV_TX_RGB_RANGE:		return "Tx RGB Quantization Range";
+	case V4L2_CID_DV_TX_IT_CONTENT_TYPE:	return "Tx IT Content Type";
 	case V4L2_CID_DV_RX_POWER_PRESENT:	return "Power Present";
 	case V4L2_CID_DV_RX_RGB_RANGE:		return "Rx RGB Quantization Range";
+	case V4L2_CID_DV_RX_IT_CONTENT_TYPE:	return "Rx IT Content Type";
 
 	case V4L2_CID_FM_RX_CLASS:		return "FM Radio Receiver Controls";
 	case V4L2_CID_TUNE_DEEMPHASIS:		return "De-Emphasis";
@@ -1038,7 +1051,9 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_SCENE_MODE:
 	case V4L2_CID_DV_TX_MODE:
 	case V4L2_CID_DV_TX_RGB_RANGE:
+	case V4L2_CID_DV_TX_IT_CONTENT_TYPE:
 	case V4L2_CID_DV_RX_RGB_RANGE:
+	case V4L2_CID_DV_RX_IT_CONTENT_TYPE:
 	case V4L2_CID_TEST_PATTERN:
 	case V4L2_CID_TUNE_DEEMPHASIS:
 	case V4L2_CID_MPEG_VIDEO_VPX_GOLDEN_FRAME_SEL:
@@ -1185,6 +1200,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_DV_TX_RXSENSE:
 	case V4L2_CID_DV_TX_EDID_PRESENT:
 	case V4L2_CID_DV_RX_POWER_PRESENT:
+	case V4L2_CID_DV_RX_IT_CONTENT_TYPE:
 	case V4L2_CID_RDS_RX_PTY:
 	case V4L2_CID_RDS_RX_PS_NAME:
 	case V4L2_CID_RDS_RX_RADIO_TEXT:
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 2d225bcdb831..2ae5c3ea179b 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -912,8 +912,18 @@ enum v4l2_dv_rgb_range {
 	V4L2_DV_RGB_RANGE_FULL	  = 2,
 };
 
+#define V4L2_CID_DV_TX_IT_CONTENT_TYPE		(V4L2_CID_DV_CLASS_BASE + 6)
+enum v4l2_dv_it_content_type {
+	V4L2_DV_IT_CONTENT_TYPE_GRAPHICS  = 0,
+	V4L2_DV_IT_CONTENT_TYPE_PHOTO	  = 1,
+	V4L2_DV_IT_CONTENT_TYPE_CINEMA	  = 2,
+	V4L2_DV_IT_CONTENT_TYPE_GAME	  = 3,
+	V4L2_DV_IT_CONTENT_TYPE_NO_ITC	  = 4,
+};
+
 #define	V4L2_CID_DV_RX_POWER_PRESENT		(V4L2_CID_DV_CLASS_BASE + 100)
 #define V4L2_CID_DV_RX_RGB_RANGE		(V4L2_CID_DV_CLASS_BASE + 101)
+#define V4L2_CID_DV_RX_IT_CONTENT_TYPE		(V4L2_CID_DV_CLASS_BASE + 102)
 
 #define V4L2_CID_FM_RX_CLASS_BASE		(V4L2_CTRL_CLASS_FM_RX | 0x900)
 #define V4L2_CID_FM_RX_CLASS			(V4L2_CTRL_CLASS_FM_RX | 1)
-- 
cgit v1.2.3


From 634790b82759c98ee57c80966d859083fa2fcd8c Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 4 Nov 2015 16:33:33 +0100
Subject: KVM: s390: migration / injection of prog irq ilc

We have to migrate the program irq ilc and someday we will have to
specify the ilc without KVM trying to autodetect the value.

Let's reuse one of the spare fields in our program irq that should
always be set to 0 by user space. Because we also want to make use
of 0 ilcs ("not available"), we need a validity indicator.

If no valid ilc is given, we try to autodetect the ilc via the current
icptcode and icptstatus + parameter and store the valid ilc in the
irq structure.

This has a nice effect: QEMU's making use of KVM_S390_IRQ /
KVM_S390_SET_IRQ_STATE / KVM_S390_GET_IRQ_STATE for migration will
directly migrate the ilc without any changes.

Please note that we use bit 0 as validity and bit 1,2 for the ilc, so
by applying the ilc mask we directly get the ilen which is usually what
we work with.

Reviewed-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/interrupt.c | 12 +++++++++++-
 include/uapi/linux/kvm.h  |  7 ++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index daa4fdbcc91c..e594a7830022 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -571,7 +571,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
 	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
 	struct kvm_s390_pgm_info pgm_info;
 	int rc = 0, nullifying = false;
-	u16 ilen = kvm_s390_get_ilen(vcpu);
+	u16 ilen;
 
 	spin_lock(&li->lock);
 	pgm_info = li->irq.pgm;
@@ -579,6 +579,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
 	memset(&li->irq.pgm, 0, sizeof(pgm_info));
 	spin_unlock(&li->lock);
 
+	ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK;
 	VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d",
 		   pgm_info.code, ilen);
 	vcpu->stat.deliver_program_int++;
@@ -1043,8 +1044,16 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 	trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
 				   irq->u.pgm.code, 0);
 
+	if (!(irq->u.pgm.flags & KVM_S390_PGM_FLAGS_ILC_VALID)) {
+		/* auto detection if no valid ILC was given */
+		irq->u.pgm.flags &= ~KVM_S390_PGM_FLAGS_ILC_MASK;
+		irq->u.pgm.flags |= kvm_s390_get_ilen(vcpu);
+		irq->u.pgm.flags |= KVM_S390_PGM_FLAGS_ILC_VALID;
+	}
+
 	if (irq->u.pgm.code == PGM_PER) {
 		li->irq.pgm.code |= PGM_PER;
+		li->irq.pgm.flags = irq->u.pgm.flags;
 		/* only modify PER related information */
 		li->irq.pgm.per_address = irq->u.pgm.per_address;
 		li->irq.pgm.per_code = irq->u.pgm.per_code;
@@ -1053,6 +1062,7 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 	} else if (!(irq->u.pgm.code & PGM_PER)) {
 		li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
 				   irq->u.pgm.code;
+		li->irq.pgm.flags = irq->u.pgm.flags;
 		/* only modify non-PER information */
 		li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
 		li->irq.pgm.mon_code = irq->u.pgm.mon_code;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 9da905157cee..4e20a40bb10f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -541,7 +541,12 @@ struct kvm_s390_pgm_info {
 	__u8 exc_access_id;
 	__u8 per_access_id;
 	__u8 op_access_id;
-	__u8 pad[3];
+#define KVM_S390_PGM_FLAGS_ILC_VALID	0x01
+#define KVM_S390_PGM_FLAGS_ILC_0	0x02
+#define KVM_S390_PGM_FLAGS_ILC_1	0x04
+#define KVM_S390_PGM_FLAGS_ILC_MASK	0x06
+	__u8 flags;
+	__u8 pad[2];
 };
 
 struct kvm_s390_prefix_info {
-- 
cgit v1.2.3


From eaa4f41642f096f1e10c15a2b172d79199e893ff Mon Sep 17 00:00:00 2001
From: David Hildenbrand <dahi@linux.vnet.ibm.com>
Date: Wed, 4 Nov 2015 16:46:55 +0100
Subject: KVM: s390: irq delivery should not rely on icptcode

Program irq injection during program irq intercepts is the last candidates
that injects nullifying irqs and relies on delivery to do the right thing.

As we should not rely on the icptcode during any delivery (because that
value will not be migrated), let's add a flag, telling prog IRQ delivery
to not rewind the PSW in case of nullifying prog IRQs.

Signed-off-by: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
---
 arch/s390/kvm/intercept.c | 2 ++
 arch/s390/kvm/interrupt.c | 2 +-
 include/uapi/linux/kvm.h  | 1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 6b4e5b5ff06c..2e6b54e4d3f9 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -140,6 +140,8 @@ static int inject_prog_on_prog_intercept(struct kvm_vcpu *vcpu)
 {
 	struct kvm_s390_pgm_info pgm_info = {
 		.code = vcpu->arch.sie_block->iprcc,
+		/* the PSW has already been rewound */
+		.flags = KVM_S390_PGM_FLAGS_NO_REWIND,
 	};
 
 	switch (vcpu->arch.sie_block->iprcc & ~PGM_PER) {
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index e594a7830022..87e2d1a89d74 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -665,7 +665,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
 				   (u8 *) __LC_PER_ACCESS_ID);
 	}
 
-	if (nullifying && vcpu->arch.sie_block->icptcode == ICPT_INST)
+	if (nullifying && !(pgm_info.flags & KVM_S390_PGM_FLAGS_NO_REWIND))
 		kvm_s390_rewind_psw(vcpu, ilen);
 
 	/* bit 1+2 of the target are the ilc, so we can directly use ilen */
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 4e20a40bb10f..a2fe0ac1d61a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -545,6 +545,7 @@ struct kvm_s390_pgm_info {
 #define KVM_S390_PGM_FLAGS_ILC_0	0x02
 #define KVM_S390_PGM_FLAGS_ILC_1	0x04
 #define KVM_S390_PGM_FLAGS_ILC_MASK	0x06
+#define KVM_S390_PGM_FLAGS_NO_REWIND	0x08
 	__u8 flags;
 	__u8 pad[2];
 };
-- 
cgit v1.2.3


From 12b74dfadb5a7a23baf4db941dc9fd9d371f249a Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:17 +0100
Subject: ipv4: add option to drop unicast encapsulated in L2 multicast

In order to solve a problem with 802.11, the so-called hole-196 attack,
add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if
enabled, causes the stack to drop IPv4 unicast packets encapsulated in
link-layer multi- or broadcast frames. Such frames can (as an attack)
be created by any member of the same wireless network and transmitted
as valid encrypted frames since the symmetric key for broadcast frames
is shared between all stations.

Additionally, enabling this option provides compliance with a SHOULD
clause of RFC 1122.

Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  7 +++++++
 include/uapi/linux/ip.h                |  1 +
 net/ipv4/devinet.c                     |  2 ++
 net/ipv4/ip_input.c                    | 25 ++++++++++++++++++++++++-
 4 files changed, 34 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 73b36d7c7b0d..d5910d63214d 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1216,6 +1216,13 @@ promote_secondaries - BOOLEAN
 	promote a corresponding secondary IP address instead of
 	removing all the corresponding secondary IP addresses.
 
+drop_unicast_in_l2_multicast - BOOLEAN
+	Drop any unicast IP packets that are received in link-layer
+	multicast (or broadcast) frames.
+	This behavior (for multicast) is actually a SHOULD in RFC
+	1122, but is disabled by default for compatibility reasons.
+	Default: off (0)
+
 
 tag - INTEGER
 	Allows you to write a number, which can be used as required.
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index 08f894d2ddbd..584834f7e95c 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -165,6 +165,7 @@ enum
 	IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
+	IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	__IPV4_DEVCONF_MAX
 };
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index cebd9d31e65a..dbbab28a52a4 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2192,6 +2192,8 @@ static struct devinet_sysctl_table {
 					      "promote_secondaries"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
 					      "route_localnet"),
+		DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
+					      "drop_unicast_in_l2_multicast"),
 	},
 };
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d77eb0c3b684..852002f64c68 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -362,8 +362,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	rt = skb_rtable(skb);
 	if (rt->rt_type == RTN_MULTICAST) {
 		IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
-	} else if (rt->rt_type == RTN_BROADCAST)
+	} else if (rt->rt_type == RTN_BROADCAST) {
 		IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
+	} else if (skb->pkt_type == PACKET_BROADCAST ||
+		   skb->pkt_type == PACKET_MULTICAST) {
+		struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+
+		/* RFC 1122 3.3.6:
+		 *
+		 *   When a host sends a datagram to a link-layer broadcast
+		 *   address, the IP destination address MUST be a legal IP
+		 *   broadcast or IP multicast address.
+		 *
+		 *   A host SHOULD silently discard a datagram that is received
+		 *   via a link-layer broadcast (see Section 2.4) but does not
+		 *   specify an IP multicast or broadcast destination address.
+		 *
+		 * This doesn't explicitly say L2 *broadcast*, but broadcast is
+		 * in a way a form of multicast and the most common use case for
+		 * this is 802.11 protecting against cross-station spoofing (the
+		 * so-called "hole-196" attack) so do it for both.
+		 */
+		if (in_dev &&
+		    IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
+			goto drop;
+	}
 
 	return dst_input(skb);
 
-- 
cgit v1.2.3


From 97daf331455077645ae1f13438bebd3d1a2e94ee Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:18 +0100
Subject: ipv4: add option to drop gratuitous ARP packets

In certain 802.11 wireless deployments, there will be ARP proxies
that use knowledge of the network to correctly answer requests.
To prevent gratuitous ARP frames on the shared medium from being
a problem, on such deployments wireless needs to drop them.

Enable this by providing an option called "drop_gratuitous_arp".

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 6 ++++++
 include/uapi/linux/ip.h                | 1 +
 net/ipv4/arp.c                         | 8 ++++++++
 net/ipv4/devinet.c                     | 2 ++
 4 files changed, 17 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index d5910d63214d..a53bbfaff1c7 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1223,6 +1223,12 @@ drop_unicast_in_l2_multicast - BOOLEAN
 	1122, but is disabled by default for compatibility reasons.
 	Default: off (0)
 
+drop_gratuitous_arp - BOOLEAN
+	Drop all gratuitous ARP frames, for example if there's a known
+	good ARP proxy on the network and such frames need not be used
+	(or in the case of 802.11, must not be used to prevent attacks.)
+	Default: off (0)
+
 
 tag - INTEGER
 	Allows you to write a number, which can be used as required.
diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h
index 584834f7e95c..f291569768dd 100644
--- a/include/uapi/linux/ip.h
+++ b/include/uapi/linux/ip.h
@@ -166,6 +166,7 @@ enum
 	IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL,
 	IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
+	IPV4_DEVCONF_DROP_GRATUITOUS_ARP,
 	__IPV4_DEVCONF_MAX
 };
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 59b3e0e8fd51..c102eb5ac55c 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -735,6 +735,14 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
 	    (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
 		goto out;
 
+ /*
+  *	For some 802.11 wireless deployments (and possibly other networks),
+  *	there will be an ARP proxy and gratuitous ARP frames are attacks
+  *	and thus should not be accepted.
+  */
+	if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
+		goto out;
+
 /*
  *     Special case: We must set Frame Relay source Q.922 address
  */
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index dbbab28a52a4..3d835313575e 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -2185,6 +2185,8 @@ static struct devinet_sysctl_table {
 					"igmpv3_unsolicited_report_interval"),
 		DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
 					"ignore_routes_with_linkdown"),
+		DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP,
+					"drop_gratuitous_arp"),
 
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
-- 
cgit v1.2.3


From abbc30436d39dfed8ebfca338d253f211ac7b094 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:19 +0100
Subject: ipv6: add option to drop unicast encapsulated in L2 multicast

In order to solve a problem with 802.11, the so-called hole-196 attack,
add an option (sysctl) called "drop_unicast_in_l2_multicast" which, if
enabled, causes the stack to drop IPv6 unicast packets encapsulated in
link-layer multi- or broadcast frames. Such frames can (as an attack)
be created by any member of the same wireless network and transmitted
as valid encrypted frames since the symmetric key for broadcast frames
is shared between all stations.

Reviewed-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |  6 ++++++
 include/linux/ipv6.h                   |  1 +
 include/uapi/linux/ipv6.h              |  1 +
 net/ipv6/addrconf.c                    |  8 ++++++++
 net/ipv6/ip6_input.c                   | 10 ++++++++++
 5 files changed, 26 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index a53bbfaff1c7..e0e7350a4e6a 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1674,6 +1674,12 @@ stable_secret - IPv6 address
 
 	By default the stable secret is unset.
 
+drop_unicast_in_l2_multicast - BOOLEAN
+	Drop any unicast IPv6 packets that are received in link-layer
+	multicast (or broadcast) frames.
+
+	By default this is turned off.
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 402753bccafa..4a4c1ae826cb 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -50,6 +50,7 @@ struct ipv6_devconf {
 	__s32		mc_forwarding;
 #endif
 	__s32		disable_ipv6;
+	__s32		drop_unicast_in_l2_multicast;
 	__s32		accept_dad;
 	__s32		force_tllao;
 	__s32           ndisc_notify;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 38b4fef20219..4c413570efe8 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -174,6 +174,7 @@ enum {
 	DEVCONF_USE_OIF_ADDRS_ONLY,
 	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
+	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 38eeddedfc21..23e325f39f8e 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4711,6 +4711,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown;
 	/* we omit DEVCONF_STABLE_SECRET for now */
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
+	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5784,6 +5785,13 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= addrconf_sysctl_ignore_routes_with_linkdown,
 		},
+		{
+			.procname	= "drop_unicast_in_l2_multicast",
+			.data		= &ipv6_devconf.drop_unicast_in_l2_multicast,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
 		{
 			/* sentinel */
 		}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 9075acf081dd..31ac3c56da4b 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -134,6 +134,16 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 	    IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1)
 		goto err;
 
+	/* If enabled, drop unicast packets that were encapsulated in link-layer
+	 * multicast or broadcast to protected against the so-called "hole-196"
+	 * attack in 802.11 wireless.
+	 */
+	if (!ipv6_addr_is_multicast(&hdr->daddr) &&
+	    (skb->pkt_type == PACKET_BROADCAST ||
+	     skb->pkt_type == PACKET_MULTICAST) &&
+	    idev->cnf.drop_unicast_in_l2_multicast)
+		goto err;
+
 	/* RFC4291 2.7
 	 * Nodes must not originate a packet to a multicast address whose scope
 	 * field contains the reserved value 0; if such a packet is received, it
-- 
cgit v1.2.3


From 7a02bf892d8f1e5298af1676f001bee410509d80 Mon Sep 17 00:00:00 2001
From: Johannes Berg <johannes.berg@intel.com>
Date: Thu, 4 Feb 2016 13:31:20 +0100
Subject: ipv6: add option to drop unsolicited neighbor advertisements

In certain 802.11 wireless deployments, there will be NA proxies
that use knowledge of the network to correctly answer requests.
To prevent unsolicitd advertisements on the shared medium from
being a problem, on such deployments wireless needs to drop them.

Enable this by providing an option called "drop_unsolicited_na".

Signed-off-by: Johannes Berg <johannes.berg@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 7 +++++++
 include/linux/ipv6.h                   | 1 +
 include/uapi/linux/ipv6.h              | 1 +
 net/ipv6/addrconf.c                    | 8 ++++++++
 net/ipv6/ndisc.c                       | 9 +++++++++
 5 files changed, 26 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index e0e7350a4e6a..24ce97f42d35 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1680,6 +1680,13 @@ drop_unicast_in_l2_multicast - BOOLEAN
 
 	By default this is turned off.
 
+drop_unsolicited_na - BOOLEAN
+	Drop all unsolicited neighbor advertisements, for example if there's
+	a known good NA proxy on the network and such frames need not be used
+	(or in the case of 802.11, must not be used to prevent attacks.)
+
+	By default this is turned off.
+
 icmp/*:
 ratelimit - INTEGER
 	Limit the maximal rates for sending ICMPv6 packets.
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 4a4c1ae826cb..4b2267e1b7c3 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -56,6 +56,7 @@ struct ipv6_devconf {
 	__s32           ndisc_notify;
 	__s32		suppress_frag_ndisc;
 	__s32		accept_ra_mtu;
+	__s32		drop_unsolicited_na;
 	struct ipv6_stable_secret {
 		bool initialized;
 		struct in6_addr secret;
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index 4c413570efe8..ec117b65d5a5 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -175,6 +175,7 @@ enum {
 	DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT,
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
+	DEVCONF_DROP_UNSOLICITED_NA,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 23e325f39f8e..ac0ba9e4e06b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4712,6 +4712,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	/* we omit DEVCONF_STABLE_SECRET for now */
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
+	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5792,6 +5793,13 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname	= "drop_unsolicited_na",
+			.data		= &ipv6_devconf.drop_unsolicited_na,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= proc_dointvec,
+		},
 		{
 			/* sentinel */
 		}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 84afb9a77278..c245895a3d41 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -883,6 +883,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
 				    offsetof(struct nd_msg, opt));
 	struct ndisc_options ndopts;
 	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = __in6_dev_get(dev);
 	struct inet6_ifaddr *ifp;
 	struct neighbour *neigh;
 
@@ -902,6 +903,14 @@ static void ndisc_recv_na(struct sk_buff *skb)
 		return;
 	}
 
+	/* For some 802.11 wireless deployments (and possibly other networks),
+	 * there will be a NA proxy and unsolicitd packets are attacks
+	 * and thus should not be accepted.
+	 */
+	if (!msg->icmph.icmp6_solicited && idev &&
+	    idev->cnf.drop_unsolicited_na)
+		return;
+
 	if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
 		ND_PRINTK(2, warn, "NS: invalid ND option\n");
 		return;
-- 
cgit v1.2.3


From 72bb68721f80a1441e871b6afc9ab0b3793d5031 Mon Sep 17 00:00:00 2001
From: Edward Cree <ecree@solarflare.com>
Date: Fri, 5 Feb 2016 11:16:21 +0000
Subject: ethtool: add IPv6 to the NFC API

Signed-off-by: Edward Cree <ecree@solarflare.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 70 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index b2e180181629..5e0940dcbfe8 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -748,6 +748,56 @@ struct ethtool_usrip4_spec {
 	__u8    proto;
 };
 
+/**
+ * struct ethtool_tcpip6_spec - flow specification for TCP/IPv6 etc.
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @psrc: Source port
+ * @pdst: Destination port
+ * @tclass: Traffic Class
+ *
+ * This can be used to specify a TCP/IPv6, UDP/IPv6 or SCTP/IPv6 flow.
+ */
+struct ethtool_tcpip6_spec {
+	__be32	ip6src[4];
+	__be32	ip6dst[4];
+	__be16	psrc;
+	__be16	pdst;
+	__u8    tclass;
+};
+
+/**
+ * struct ethtool_ah_espip6_spec - flow specification for IPsec/IPv6
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @spi: Security parameters index
+ * @tclass: Traffic Class
+ *
+ * This can be used to specify an IPsec transport or tunnel over IPv6.
+ */
+struct ethtool_ah_espip6_spec {
+	__be32	ip6src[4];
+	__be32	ip6dst[4];
+	__be32	spi;
+	__u8    tclass;
+};
+
+/**
+ * struct ethtool_usrip6_spec - general flow specification for IPv6
+ * @ip6src: Source host
+ * @ip6dst: Destination host
+ * @l4_4_bytes: First 4 bytes of transport (layer 4) header
+ * @tclass: Traffic Class
+ * @l4_proto: Transport protocol number (nexthdr after any Extension Headers)
+ */
+struct ethtool_usrip6_spec {
+	__be32	ip6src[4];
+	__be32	ip6dst[4];
+	__be32	l4_4_bytes;
+	__u8    tclass;
+	__u8    l4_proto;
+};
+
 union ethtool_flow_union {
 	struct ethtool_tcpip4_spec		tcp_ip4_spec;
 	struct ethtool_tcpip4_spec		udp_ip4_spec;
@@ -755,6 +805,12 @@ union ethtool_flow_union {
 	struct ethtool_ah_espip4_spec		ah_ip4_spec;
 	struct ethtool_ah_espip4_spec		esp_ip4_spec;
 	struct ethtool_usrip4_spec		usr_ip4_spec;
+	struct ethtool_tcpip6_spec		tcp_ip6_spec;
+	struct ethtool_tcpip6_spec		udp_ip6_spec;
+	struct ethtool_tcpip6_spec		sctp_ip6_spec;
+	struct ethtool_ah_espip6_spec		ah_ip6_spec;
+	struct ethtool_ah_espip6_spec		esp_ip6_spec;
+	struct ethtool_usrip6_spec		usr_ip6_spec;
 	struct ethhdr				ether_spec;
 	__u8					hdata[52];
 };
@@ -1401,15 +1457,17 @@ static inline int ethtool_validate_duplex(__u8 duplex)
 #define	UDP_V4_FLOW	0x02	/* hash or spec (udp_ip4_spec) */
 #define	SCTP_V4_FLOW	0x03	/* hash or spec (sctp_ip4_spec) */
 #define	AH_ESP_V4_FLOW	0x04	/* hash only */
-#define	TCP_V6_FLOW	0x05	/* hash only */
-#define	UDP_V6_FLOW	0x06	/* hash only */
-#define	SCTP_V6_FLOW	0x07	/* hash only */
+#define	TCP_V6_FLOW	0x05	/* hash or spec (tcp_ip6_spec; nfc only) */
+#define	UDP_V6_FLOW	0x06	/* hash or spec (udp_ip6_spec; nfc only) */
+#define	SCTP_V6_FLOW	0x07	/* hash or spec (sctp_ip6_spec; nfc only) */
 #define	AH_ESP_V6_FLOW	0x08	/* hash only */
 #define	AH_V4_FLOW	0x09	/* hash or spec (ah_ip4_spec) */
 #define	ESP_V4_FLOW	0x0a	/* hash or spec (esp_ip4_spec) */
-#define	AH_V6_FLOW	0x0b	/* hash only */
-#define	ESP_V6_FLOW	0x0c	/* hash only */
-#define	IP_USER_FLOW	0x0d	/* spec only (usr_ip4_spec) */
+#define	AH_V6_FLOW	0x0b	/* hash or spec (ah_ip6_spec; nfc only) */
+#define	ESP_V6_FLOW	0x0c	/* hash or spec (esp_ip6_spec; nfc only) */
+#define	IPV4_USER_FLOW	0x0d	/* spec only (usr_ip4_spec) */
+#define	IP_USER_FLOW	IPV4_USER_FLOW
+#define	IPV6_USER_FLOW	0x0e	/* spec only (usr_ip6_spec; nfc only) */
 #define	IPV4_FLOW	0x10	/* hash only */
 #define	IPV6_FLOW	0x11	/* hash only */
 #define	ETHER_FLOW	0x12	/* spec only (ether_spec) */
-- 
cgit v1.2.3


From 4456ed04ea44b800d691b18c14a68ec9894d2aca Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 7 Feb 2016 23:27:55 +0200
Subject: ethtool: future-proof interface for speed extensions

Many virtual and not quite virtual devices allow any speed to be set
through ethtool. In particular, this applies to the virtio-net devices.
Document this fact to make sure people don't assume the enum lists all
possible values.  Reserve values greater than INT_MAX for future
extension and to avoid conflict with SPEED_UNKNOWN.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 5e0940dcbfe8..4345f80a2e33 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -31,7 +31,7 @@
  *	physical connectors and other link features that are
  *	advertised through autonegotiation or enabled for
  *	auto-detection.
- * @speed: Low bits of the speed
+ * @speed: Low bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN
  * @duplex: Duplex mode; one of %DUPLEX_*
  * @port: Physical connector type; one of %PORT_*
  * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not
@@ -47,7 +47,7 @@
  *	obsoleted by &struct ethtool_coalesce.  Read-only; deprecated.
  * @maxrxpkt: Historically used to report RX IRQ coalescing; now
  *	obsoleted by &struct ethtool_coalesce.  Read-only; deprecated.
- * @speed_hi: High bits of the speed
+ * @speed_hi: High bits of the speed, 1Mb units, 0 to INT_MAX or SPEED_UNKNOWN
  * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of
  *	%ETH_TP_MDI_*.  If the status is unknown or not applicable, the
  *	value will be %ETH_TP_MDI_INVALID.  Read-only.
@@ -1359,7 +1359,7 @@ enum ethtool_sfeatures_retval_bits {
  * it was forced up into this mode or autonegotiated.
  */
 
-/* The forced speed, 10Mb, 100Mb, gigabit, [2.5|5|10|20|25|40|50|56|100]GbE. */
+/* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. */
 #define SPEED_10		10
 #define SPEED_100		100
 #define SPEED_1000		1000
-- 
cgit v1.2.3


From 4a92602aa1cd5bbaeedbd9536ff992f7d26fe9d1 Mon Sep 17 00:00:00 2001
From: Tycho Andersen <tycho.andersen@canonical.com>
Date: Fri, 5 Feb 2016 09:20:52 -0700
Subject: openvswitch: allow management from inside user namespaces

Operations with the GENL_ADMIN_PERM flag fail permissions checks because
this flag means we call netlink_capable, which uses the init user ns.

Instead, let's introduce a new flag, GENL_UNS_ADMIN_PERM for operations
which should be allowed inside a user namespace.

The motivation for this is to be able to run openvswitch in unprivileged
containers. I've tested this and it seems to work, but I really have no
idea about the security consequences of this patch, so thoughts would be
much appreciated.

v2: use the GENL_UNS_ADMIN_PERM flag instead of a check in each function
v3: use separate ifs for UNS_ADMIN_PERM and ADMIN_PERM, instead of one
    massive one

Reported-by: James Page <james.page@canonical.com>
Signed-off-by: Tycho Andersen <tycho.andersen@canonical.com>
CC: Eric Biederman <ebiederm@xmission.com>
CC: Pravin Shelar <pshelar@ovn.org>
CC: Justin Pettit <jpettit@nicira.com>
CC: "David S. Miller" <davem@davemloft.net>
Acked-by: Pravin B Shelar <pshelar@ovn.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/genetlink.h |  1 +
 net/netlink/genetlink.c        |  4 ++++
 net/openvswitch/datapath.c     | 20 ++++++++++----------
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/genetlink.h b/include/uapi/linux/genetlink.h
index c3363ba1ae05..5512c90af7e3 100644
--- a/include/uapi/linux/genetlink.h
+++ b/include/uapi/linux/genetlink.h
@@ -21,6 +21,7 @@ struct genlmsghdr {
 #define GENL_CMD_CAP_DO		0x02
 #define GENL_CMD_CAP_DUMP	0x04
 #define GENL_CMD_CAP_HASPOL	0x08
+#define GENL_UNS_ADMIN_PERM	0x10
 
 /*
  * List of reserved static generic netlink identifiers:
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index f830326b3b1d..0ffd721126e7 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -580,6 +580,10 @@ static int genl_family_rcv_msg(struct genl_family *family,
 	    !netlink_capable(skb, CAP_NET_ADMIN))
 		return -EPERM;
 
+	if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
+	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+		return -EPERM;
+
 	if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
 		int rc;
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index deadfdab1bc3..d6f7fe92744a 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -654,7 +654,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_packet_genl_ops[] = {
 	{ .cmd = OVS_PACKET_CMD_EXECUTE,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = packet_policy,
 	  .doit = ovs_packet_cmd_execute
 	}
@@ -1391,12 +1391,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_flow_genl_ops[] = {
 	{ .cmd = OVS_FLOW_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_new
 	},
 	{ .cmd = OVS_FLOW_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_del
 	},
@@ -1407,7 +1407,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
 	  .dumpit = ovs_flow_cmd_dump
 	},
 	{ .cmd = OVS_FLOW_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = flow_policy,
 	  .doit = ovs_flow_cmd_set,
 	},
@@ -1777,12 +1777,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_datapath_genl_ops[] = {
 	{ .cmd = OVS_DP_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_new
 	},
 	{ .cmd = OVS_DP_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_del
 	},
@@ -1793,7 +1793,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
 	  .dumpit = ovs_dp_cmd_dump
 	},
 	{ .cmd = OVS_DP_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = datapath_policy,
 	  .doit = ovs_dp_cmd_set,
 	},
@@ -2158,12 +2158,12 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
 
 static const struct genl_ops dp_vport_genl_ops[] = {
 	{ .cmd = OVS_VPORT_CMD_NEW,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_new
 	},
 	{ .cmd = OVS_VPORT_CMD_DEL,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_del
 	},
@@ -2174,7 +2174,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
 	  .dumpit = ovs_vport_cmd_dump
 	},
 	{ .cmd = OVS_VPORT_CMD_SET,
-	  .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+	  .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
 	  .policy = vport_policy,
 	  .doit = ovs_vport_cmd_set,
 	},
-- 
cgit v1.2.3


From e02564ee334a7ae46b71fc18576391cb9455433e Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Sun, 7 Feb 2016 21:52:23 +0100
Subject: ethtool: make validate_speed accept all speeds between 0 and INT_MAX

Devices these days can have any speed and as was recently pointed out
any speed from 0 to INT_MAX is valid so adjust speed validation to
accept such values.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 4345f80a2e33..190aea0faaf4 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1377,24 +1377,7 @@ enum ethtool_sfeatures_retval_bits {
 
 static inline int ethtool_validate_speed(__u32 speed)
 {
-	switch (speed) {
-	case SPEED_10:
-	case SPEED_100:
-	case SPEED_1000:
-	case SPEED_2500:
-	case SPEED_5000:
-	case SPEED_10000:
-	case SPEED_20000:
-	case SPEED_25000:
-	case SPEED_40000:
-	case SPEED_50000:
-	case SPEED_56000:
-	case SPEED_100000:
-	case SPEED_UNKNOWN:
-		return 1;
-	}
-
-	return 0;
+	return speed <= INT_MAX || speed == SPEED_UNKNOWN;
 }
 
 /* Duplex, half or full. */
-- 
cgit v1.2.3


From c11e391da2a8fe973c3c2398452000bed505851e Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Thu, 11 Feb 2016 20:04:51 -0200
Subject: dma-buf: Add ioctls to allow userspace to flush
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The userspace might need some sort of cache coherency management e.g. when CPU
and GPU domains are being accessed through dma-buf at the same time. To
circumvent this problem there are begin/end coherency markers, that forward
directly to existing dma-buf device drivers vfunc hooks. Userspace can make use
of those markers through the DMA_BUF_IOCTL_SYNC ioctl. The sequence would be
used like following:
     - mmap dma-buf fd
     - for each drawing/upload cycle in CPU 1. SYNC_START ioctl, 2. read/write
       to mmap area 3. SYNC_END ioctl. This can be repeated as often as you
       want (with the new data being consumed by the GPU or say scanout device)
     - munmap once you don't need the buffer any more

v2 (Tiago): Fix header file type names (u64 -> __u64)
v3 (Tiago): Add documentation. Use enum dma_buf_sync_flags to the begin/end
dma-buf functions. Check for overflows in start/length.
v4 (Tiago): use 2d regions for sync.
v5 (Tiago): forget about 2d regions (v4); use _IOW in DMA_BUF_IOCTL_SYNC and
remove range information from struct dma_buf_sync.
v6 (Tiago): use __u64 structured padded flags instead enum. Adjust
documentation about the recommendation on using sync ioctls.
v7 (Tiago): Alex' nit on flags definition and being even more wording in the
doc about sync usage.
v9 (Tiago): remove useless is_dma_buf_file check. Fix sync.flags conditionals
and its mask order check. Add <linux/types.h> include in dma-buf.h.

Cc: Ville Syrjälä <ville.syrjala@linux.intel.com>
Cc: David Herrmann <dh.herrmann@gmail.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Reviewed-by: Stéphane Marchesin <marcheu@chromium.org>
Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Signed-off-by: Tiago Vignatti <tiago.vignatti@intel.com>
Reviewed-by: David Herrmann <dh.herrmann@gmail.com>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Link: http://patchwork.freedesktop.org/patch/msgid/1455228291-29640-1-git-send-email-tiago.vignatti@intel.com
---
 Documentation/dma-buf-sharing.txt | 21 +++++++++++++++++-
 drivers/dma-buf/dma-buf.c         | 45 +++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/dma-buf.h      | 40 ++++++++++++++++++++++++++++++++++
 3 files changed, 105 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/dma-buf.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/dma-buf-sharing.txt b/Documentation/dma-buf-sharing.txt
index 4f4a84b6903a..32ac32e773e1 100644
--- a/Documentation/dma-buf-sharing.txt
+++ b/Documentation/dma-buf-sharing.txt
@@ -350,7 +350,26 @@ Being able to mmap an export dma-buf buffer object has 2 main use-cases:
    handles, too). So it's beneficial to support this in a similar fashion on
    dma-buf to have a good transition path for existing Android userspace.
 
-   No special interfaces, userspace simply calls mmap on the dma-buf fd.
+   No special interfaces, userspace simply calls mmap on the dma-buf fd, making
+   sure that the cache synchronization ioctl (DMA_BUF_IOCTL_SYNC) is *always*
+   used when the access happens. This is discussed next paragraphs.
+
+   Some systems might need some sort of cache coherency management e.g. when
+   CPU and GPU domains are being accessed through dma-buf at the same time. To
+   circumvent this problem there are begin/end coherency markers, that forward
+   directly to existing dma-buf device drivers vfunc hooks. Userspace can make
+   use of those markers through the DMA_BUF_IOCTL_SYNC ioctl. The sequence
+   would be used like following:
+     - mmap dma-buf fd
+     - for each drawing/upload cycle in CPU 1. SYNC_START ioctl, 2. read/write
+       to mmap area 3. SYNC_END ioctl. This can be repeated as often as you
+       want (with the new data being consumed by the GPU or say scanout device)
+     - munmap once you don't need the buffer any more
+
+    Therefore, for correctness and optimal performance, systems with the memory
+    cache shared by the GPU and CPU i.e. the "coherent" and also the
+    "incoherent" are always required to use SYNC_START and SYNC_END before and
+    after, respectively, when accessing the mapped address.
 
 2. Supporting existing mmap interfaces in importers
 
diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
index b2ac13b4ddaa..9810d1df0691 100644
--- a/drivers/dma-buf/dma-buf.c
+++ b/drivers/dma-buf/dma-buf.c
@@ -34,6 +34,8 @@
 #include <linux/poll.h>
 #include <linux/reservation.h>
 
+#include <uapi/linux/dma-buf.h>
+
 static inline int is_dma_buf_file(struct file *);
 
 struct dma_buf_list {
@@ -251,11 +253,54 @@ out:
 	return events;
 }
 
+static long dma_buf_ioctl(struct file *file,
+			  unsigned int cmd, unsigned long arg)
+{
+	struct dma_buf *dmabuf;
+	struct dma_buf_sync sync;
+	enum dma_data_direction direction;
+
+	dmabuf = file->private_data;
+
+	switch (cmd) {
+	case DMA_BUF_IOCTL_SYNC:
+		if (copy_from_user(&sync, (void __user *) arg, sizeof(sync)))
+			return -EFAULT;
+
+		if (sync.flags & ~DMA_BUF_SYNC_VALID_FLAGS_MASK)
+			return -EINVAL;
+
+		switch (sync.flags & DMA_BUF_SYNC_RW) {
+		case DMA_BUF_SYNC_READ:
+			direction = DMA_FROM_DEVICE;
+			break;
+		case DMA_BUF_SYNC_WRITE:
+			direction = DMA_TO_DEVICE;
+			break;
+		case DMA_BUF_SYNC_RW:
+			direction = DMA_BIDIRECTIONAL;
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		if (sync.flags & DMA_BUF_SYNC_END)
+			dma_buf_end_cpu_access(dmabuf, direction);
+		else
+			dma_buf_begin_cpu_access(dmabuf, direction);
+
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
+
 static const struct file_operations dma_buf_fops = {
 	.release	= dma_buf_release,
 	.mmap		= dma_buf_mmap_internal,
 	.llseek		= dma_buf_llseek,
 	.poll		= dma_buf_poll,
+	.unlocked_ioctl	= dma_buf_ioctl,
 };
 
 /*
diff --git a/include/uapi/linux/dma-buf.h b/include/uapi/linux/dma-buf.h
new file mode 100644
index 000000000000..fb0dedb7c121
--- /dev/null
+++ b/include/uapi/linux/dma-buf.h
@@ -0,0 +1,40 @@
+/*
+ * Framework for buffer objects that can be shared across devices/subsystems.
+ *
+ * Copyright(C) 2015 Intel Ltd
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _DMA_BUF_UAPI_H_
+#define _DMA_BUF_UAPI_H_
+
+#include <linux/types.h>
+
+/* begin/end dma-buf functions used for userspace mmap. */
+struct dma_buf_sync {
+	__u64 flags;
+};
+
+#define DMA_BUF_SYNC_READ      (1 << 0)
+#define DMA_BUF_SYNC_WRITE     (2 << 0)
+#define DMA_BUF_SYNC_RW        (DMA_BUF_SYNC_READ | DMA_BUF_SYNC_WRITE)
+#define DMA_BUF_SYNC_START     (0 << 2)
+#define DMA_BUF_SYNC_END       (1 << 2)
+#define DMA_BUF_SYNC_VALID_FLAGS_MASK \
+	(DMA_BUF_SYNC_RW | DMA_BUF_SYNC_END)
+
+#define DMA_BUF_BASE		'b'
+#define DMA_BUF_IOCTL_SYNC	_IOW(DMA_BUF_BASE, 0, struct dma_buf_sync)
+
+#endif
-- 
cgit v1.2.3


From c8b1d8977eee3acc63a65811dd72ec4a93b74388 Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Fri, 12 Feb 2016 16:40:12 +0200
Subject: usb: Add USB3.1 SuperSpeedPlus Isoc Endpoint Companion descriptor

USB3.1 specifies a SuperSpeedPlus Isoc endpoint companion descriptor
which is returned as part of the devices complete configuration
descriptor.

It contains number of bytes per service interval which is needed when
reserving bus time in the schedule for transfers over 48K bytes per
service interval.

If bmAttributes bit 7 is set in the old SuperSpeed Endpoint Companion
descriptor, it will be ollowed by the new SuperSpeedPlus Isoc Endpoint
Companion descriptor.

Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/uapi/linux/usb/ch9.h | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index 779a62aafafe..a65f1f328de1 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -234,6 +234,8 @@ struct usb_ctrlrequest {
 #define USB_DT_PIPE_USAGE		0x24
 /* From the USB 3.0 spec */
 #define	USB_DT_SS_ENDPOINT_COMP		0x30
+/* From the USB 3.1 spec */
+#define	USB_DT_SSP_ISOC_ENDPOINT_COMP	0x31
 
 /* Conventional codes for class-specific descriptors.  The convention is
  * defined in the USB "Common Class" Spec (3.11).  Individual class specs
@@ -613,6 +615,20 @@ static inline int usb_endpoint_interrupt_type(
 
 /*-------------------------------------------------------------------------*/
 
+/* USB_DT_SSP_ISOC_ENDPOINT_COMP: SuperSpeedPlus Isochronous Endpoint Companion
+ * descriptor
+ */
+struct usb_ssp_isoc_ep_comp_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__le16 wReseved;
+	__le32 dwBytesPerInterval;
+} __attribute__ ((packed));
+
+#define USB_DT_SSP_ISOC_EP_COMP_SIZE		8
+
+/*-------------------------------------------------------------------------*/
+
 /* USB_DT_SS_ENDPOINT_COMP: SuperSpeed Endpoint Companion descriptor */
 struct usb_ss_ep_comp_descriptor {
 	__u8  bLength;
@@ -646,6 +662,8 @@ usb_ss_max_streams(const struct usb_ss_ep_comp_descriptor *comp)
 
 /* Bits 1:0 of bmAttributes if this is an isoc endpoint */
 #define USB_SS_MULT(p)			(1 + ((p) & 0x3))
+/* Bit 7 of bmAttributes if a SSP isoc endpoint companion descriptor exists */
+#define USB_SS_SSP_ISOC_COMP(p)		((p) & (1 << 7))
 
 /*-------------------------------------------------------------------------*/
 
-- 
cgit v1.2.3


From faee822c5a7ab99de25cd34fcde3f8d37b6b9923 Mon Sep 17 00:00:00 2001
From: Mathias Nyman <mathias.nyman@linux.intel.com>
Date: Fri, 12 Feb 2016 16:40:14 +0200
Subject: usb: Add USB 3.1 Precision time measurement capability descriptor
 support

USB 3.1 devices that support precision time measurement have an
additional PTM cabaility descriptor as part of the full BOS descriptor

Look for this descriptor while parsing the BOS descriptor, and store it in
struct usb_hub_bos if it exists.

Signed-off-by: Mathias Nyman <mathias.nyman@linux.intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/config.c    |  3 +++
 include/linux/usb.h          |  1 +
 include/uapi/linux/usb/ch9.h | 10 ++++++++++
 3 files changed, 14 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
index 58d089802ab3..5eb1a87228b4 100644
--- a/drivers/usb/core/config.c
+++ b/drivers/usb/core/config.c
@@ -894,6 +894,9 @@ int usb_get_bos_descriptor(struct usb_device *dev)
 			dev->bos->ss_id =
 				(struct usb_ss_container_id_descriptor *)buffer;
 			break;
+		case USB_PTM_CAP_TYPE:
+			dev->bos->ptm_cap =
+				(struct usb_ptm_cap_descriptor *)buffer;
 		default:
 			break;
 		}
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 8fc881af8735..6a9a0c28415d 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -332,6 +332,7 @@ struct usb_host_bos {
 	struct usb_ss_cap_descriptor	*ss_cap;
 	struct usb_ssp_cap_descriptor	*ssp_cap;
 	struct usb_ss_container_id_descriptor	*ss_id;
+	struct usb_ptm_cap_descriptor	*ptm_cap;
 };
 
 int __usb_get_extra_descriptor(char *buffer, unsigned size,
diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index a65f1f328de1..252ac16635dc 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -912,6 +912,16 @@ struct usb_ssp_cap_descriptor {
 #define USB_SSP_SUBLINK_SPEED_LSM	(0xff << 16)	/* Lanespeed mantissa */
 } __attribute__((packed));
 
+/*
+ * Precision time measurement capability descriptor: advertised by devices and
+ * hubs that support PTM
+ */
+#define	USB_PTM_CAP_TYPE	0xb
+struct usb_ptm_cap_descriptor {
+	__u8  bLength;
+	__u8  bDescriptorType;
+	__u8  bDevCapabilityType;
+} __attribute__((packed));
 
 /*-------------------------------------------------------------------------*/
 
-- 
cgit v1.2.3


From f7d34b445abc00e979b7cf36b9580ac3d1a47cd8 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sesse@google.com>
Date: Wed, 3 Feb 2016 22:58:26 +0100
Subject: USB: Add support for usbfs zerocopy.

Add a new interface for userspace to preallocate memory that can be
used with usbfs. This gives two primary benefits:

 - Zerocopy; data no longer needs to be copied between the userspace
   and the kernel, but can instead be read directly by the driver from
   userspace's buffers. This works for all kinds of transfers (even if
   nonsensical for control and interrupt transfers); isochronous also
   no longer need to memset() the buffer to zero to avoid leaking kernel data.

 - Once the buffers are allocated, USB transfers can no longer fail due to
   memory fragmentation; previously, long-running programs could run into
   problems finding a large enough contiguous memory chunk, especially on
   embedded systems or at high rates.

Memory is allocated by using mmap() against the usbfs file descriptor,
and similarly deallocated by munmap(). Once memory has been allocated,
using it as pointers to a bulk or isochronous operation means you will
automatically get zerocopy behavior. Note that this also means you cannot
modify outgoing data until the transfer is complete. The same holds for
data on the same cache lines as incoming data; DMA modifying them at the
same time could lead to your changes being overwritten.

There's a new capability USBDEVFS_CAP_MMAP that userspace can query to see
if the running kernel supports this functionality, if just trying mmap() is
not acceptable.

Largely based on a patch by Markus Rechberger with some updates. The original
patch can be found at:

  http://sundtek.de/support/devio_mmap_v0.4.diff

Signed-off-by: Steinar H. Gunderson <sesse@google.com>
Signed-off-by: Markus Rechberger <mrechberger@gmail.com>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/usb/core/devio.c          | 227 +++++++++++++++++++++++++++++++++-----
 include/uapi/linux/usbdevice_fs.h |   1 +
 2 files changed, 203 insertions(+), 25 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 13a4b9f48739..39da1662b76f 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -50,6 +50,7 @@
 #include <linux/user_namespace.h>
 #include <linux/scatterlist.h>
 #include <linux/uaccess.h>
+#include <linux/dma-mapping.h>
 #include <asm/byteorder.h>
 #include <linux/moduleparam.h>
 
@@ -69,6 +70,7 @@ struct usb_dev_state {
 	spinlock_t lock;            /* protects the async urb lists */
 	struct list_head async_pending;
 	struct list_head async_completed;
+	struct list_head memory_list;
 	wait_queue_head_t wait;     /* wake up if a request completed */
 	unsigned int discsignr;
 	struct pid *disc_pid;
@@ -79,6 +81,17 @@ struct usb_dev_state {
 	u32 disabled_bulk_eps;
 };
 
+struct usb_memory {
+	struct list_head memlist;
+	int vma_use_count;
+	int urb_use_count;
+	u32 size;
+	void *mem;
+	dma_addr_t dma_handle;
+	unsigned long vm_start;
+	struct usb_dev_state *ps;
+};
+
 struct async {
 	struct list_head asynclist;
 	struct usb_dev_state *ps;
@@ -89,6 +102,7 @@ struct async {
 	void __user *userbuffer;
 	void __user *userurb;
 	struct urb *urb;
+	struct usb_memory *usbm;
 	unsigned int mem_usage;
 	int status;
 	u32 secid;
@@ -162,6 +176,111 @@ static int connected(struct usb_dev_state *ps)
 			ps->dev->state != USB_STATE_NOTATTACHED);
 }
 
+static void dec_usb_memory_use_count(struct usb_memory *usbm, int *count)
+{
+	struct usb_dev_state *ps = usbm->ps;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ps->lock, flags);
+	--*count;
+	if (usbm->urb_use_count == 0 && usbm->vma_use_count == 0) {
+		list_del(&usbm->memlist);
+		spin_unlock_irqrestore(&ps->lock, flags);
+
+		usb_free_coherent(ps->dev, usbm->size, usbm->mem,
+				usbm->dma_handle);
+		usbfs_decrease_memory_usage(
+			usbm->size + sizeof(struct usb_memory));
+		kfree(usbm);
+	} else {
+		spin_unlock_irqrestore(&ps->lock, flags);
+	}
+}
+
+static void usbdev_vm_open(struct vm_area_struct *vma)
+{
+	struct usb_memory *usbm = vma->vm_private_data;
+	unsigned long flags;
+
+	spin_lock_irqsave(&usbm->ps->lock, flags);
+	++usbm->vma_use_count;
+	spin_unlock_irqrestore(&usbm->ps->lock, flags);
+}
+
+static void usbdev_vm_close(struct vm_area_struct *vma)
+{
+	struct usb_memory *usbm = vma->vm_private_data;
+
+	dec_usb_memory_use_count(usbm, &usbm->vma_use_count);
+}
+
+struct vm_operations_struct usbdev_vm_ops = {
+	.open = usbdev_vm_open,
+	.close = usbdev_vm_close
+};
+
+static int usbdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct usb_memory *usbm = NULL;
+	struct usb_dev_state *ps = file->private_data;
+	size_t size = vma->vm_end - vma->vm_start;
+	void *mem;
+	unsigned long flags;
+	dma_addr_t dma_handle;
+	int ret;
+
+	ret = usbfs_increase_memory_usage(size + sizeof(struct usb_memory));
+	if (ret)
+		goto error;
+
+	usbm = kzalloc(sizeof(struct usb_memory), GFP_KERNEL);
+	if (!usbm) {
+		ret = -ENOMEM;
+		goto error_decrease_mem;
+	}
+
+	mem = usb_alloc_coherent(ps->dev, size, GFP_USER, &dma_handle);
+	if (!mem) {
+		ret = -ENOMEM;
+		goto error_free_usbm;
+	}
+
+	memset(mem, 0, size);
+
+	usbm->mem = mem;
+	usbm->dma_handle = dma_handle;
+	usbm->size = size;
+	usbm->ps = ps;
+	usbm->vm_start = vma->vm_start;
+	usbm->vma_use_count = 1;
+	INIT_LIST_HEAD(&usbm->memlist);
+
+	if (remap_pfn_range(vma, vma->vm_start,
+			virt_to_phys(usbm->mem) >> PAGE_SHIFT,
+			size, vma->vm_page_prot) < 0) {
+		dec_usb_memory_use_count(usbm, &usbm->vma_use_count);
+		return -EAGAIN;
+	}
+
+	vma->vm_flags |= VM_IO;
+	vma->vm_flags |= (VM_DONTEXPAND | VM_DONTDUMP);
+	vma->vm_ops = &usbdev_vm_ops;
+	vma->vm_private_data = usbm;
+
+	spin_lock_irqsave(&ps->lock, flags);
+	list_add_tail(&usbm->memlist, &ps->memory_list);
+	spin_unlock_irqrestore(&ps->lock, flags);
+
+	return 0;
+
+error_free_usbm:
+	kfree(usbm);
+error_decrease_mem:
+	usbfs_decrease_memory_usage(size + sizeof(struct usb_memory));
+error:
+	return ret;
+}
+
 static ssize_t usbdev_read(struct file *file, char __user *buf, size_t nbytes,
 			   loff_t *ppos)
 {
@@ -278,8 +397,13 @@ static void free_async(struct async *as)
 		if (sg_page(&as->urb->sg[i]))
 			kfree(sg_virt(&as->urb->sg[i]));
 	}
+
 	kfree(as->urb->sg);
-	kfree(as->urb->transfer_buffer);
+	if (as->usbm == NULL)
+		kfree(as->urb->transfer_buffer);
+	else
+		dec_usb_memory_use_count(as->usbm, &as->usbm->urb_use_count);
+
 	kfree(as->urb->setup_packet);
 	usb_free_urb(as->urb);
 	usbfs_decrease_memory_usage(as->mem_usage);
@@ -893,6 +1017,7 @@ static int usbdev_open(struct inode *inode, struct file *file)
 	INIT_LIST_HEAD(&ps->list);
 	INIT_LIST_HEAD(&ps->async_pending);
 	INIT_LIST_HEAD(&ps->async_completed);
+	INIT_LIST_HEAD(&ps->memory_list);
 	init_waitqueue_head(&ps->wait);
 	ps->discsignr = 0;
 	ps->disc_pid = get_pid(task_pid(current));
@@ -945,6 +1070,7 @@ static int usbdev_release(struct inode *inode, struct file *file)
 		free_async(as);
 		as = async_getcompleted(ps);
 	}
+
 	kfree(ps);
 	return 0;
 }
@@ -1266,6 +1392,31 @@ static int proc_setconfig(struct usb_dev_state *ps, void __user *arg)
 	return status;
 }
 
+static struct usb_memory *
+find_memory_area(struct usb_dev_state *ps, const struct usbdevfs_urb *uurb)
+{
+	struct usb_memory *usbm = NULL, *iter;
+	unsigned long flags;
+	unsigned long uurb_start = (unsigned long)uurb->buffer;
+
+	spin_lock_irqsave(&ps->lock, flags);
+	list_for_each_entry(iter, &ps->memory_list, memlist) {
+		if (uurb_start >= iter->vm_start &&
+				uurb_start < iter->vm_start + iter->size) {
+			if (uurb->buffer_length > iter->vm_start + iter->size -
+					uurb_start) {
+				usbm = ERR_PTR(-EINVAL);
+			} else {
+				usbm = iter;
+				usbm->urb_use_count++;
+			}
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&ps->lock, flags);
+	return usbm;
+}
+
 static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb,
 			struct usbdevfs_iso_packet_desc __user *iso_frame_desc,
 			void __user *arg)
@@ -1421,6 +1572,19 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
 		goto error;
 	}
 
+	as->usbm = find_memory_area(ps, uurb);
+	if (IS_ERR(as->usbm)) {
+		ret = PTR_ERR(as->usbm);
+		as->usbm = NULL;
+		goto error;
+	}
+
+	/* do not use SG buffers when memory mapped segments
+	 * are in use
+	 */
+	if (as->usbm)
+		num_sgs = 0;
+
 	u += sizeof(struct async) + sizeof(struct urb) + uurb->buffer_length +
 	     num_sgs * sizeof(struct scatterlist);
 	ret = usbfs_increase_memory_usage(u);
@@ -1458,29 +1622,35 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
 			totlen -= u;
 		}
 	} else if (uurb->buffer_length > 0) {
-		as->urb->transfer_buffer = kmalloc(uurb->buffer_length,
-				GFP_KERNEL);
-		if (!as->urb->transfer_buffer) {
-			ret = -ENOMEM;
-			goto error;
-		}
+		if (as->usbm) {
+			unsigned long uurb_start = (unsigned long)uurb->buffer;
 
-		if (!is_in) {
-			if (copy_from_user(as->urb->transfer_buffer,
-					   uurb->buffer,
-					   uurb->buffer_length)) {
-				ret = -EFAULT;
+			as->urb->transfer_buffer = as->usbm->mem +
+					(uurb_start - as->usbm->vm_start);
+		} else {
+			as->urb->transfer_buffer = kmalloc(uurb->buffer_length,
+					GFP_KERNEL);
+			if (!as->urb->transfer_buffer) {
+				ret = -ENOMEM;
 				goto error;
 			}
-		} else if (uurb->type == USBDEVFS_URB_TYPE_ISO) {
-			/*
-			 * Isochronous input data may end up being
-			 * discontiguous if some of the packets are short.
-			 * Clear the buffer so that the gaps don't leak
-			 * kernel data to userspace.
-			 */
-			memset(as->urb->transfer_buffer, 0,
-					uurb->buffer_length);
+			if (!is_in) {
+				if (copy_from_user(as->urb->transfer_buffer,
+						   uurb->buffer,
+						   uurb->buffer_length)) {
+					ret = -EFAULT;
+					goto error;
+				}
+			} else if (uurb->type == USBDEVFS_URB_TYPE_ISO) {
+				/*
+				 * Isochronous input data may end up being
+				 * discontiguous if some of the packets are
+				 * short. Clear the buffer so that the gaps
+				 * don't leak kernel data to userspace.
+				 */
+				memset(as->urb->transfer_buffer, 0,
+						uurb->buffer_length);
+			}
 		}
 	}
 	as->urb->dev = ps->dev;
@@ -1527,10 +1697,14 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
 	isopkt = NULL;
 	as->ps = ps;
 	as->userurb = arg;
-	if (is_in && uurb->buffer_length > 0)
+	if (as->usbm) {
+		unsigned long uurb_start = (unsigned long)uurb->buffer;
+
+		as->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
+		as->urb->transfer_dma = as->usbm->dma_handle +
+				(uurb_start - as->usbm->vm_start);
+	} else if (is_in && uurb->buffer_length > 0)
 		as->userbuffer = uurb->buffer;
-	else
-		as->userbuffer = NULL;
 	as->signr = uurb->signr;
 	as->ifnum = ifnum;
 	as->pid = get_pid(task_pid(current));
@@ -1586,6 +1760,8 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb
 	return 0;
 
  error:
+	if (as && as->usbm)
+		dec_usb_memory_use_count(as->usbm, &as->usbm->urb_use_count);
 	kfree(isopkt);
 	kfree(dr);
 	if (as)
@@ -2039,7 +2215,7 @@ static int proc_get_capabilities(struct usb_dev_state *ps, void __user *arg)
 	__u32 caps;
 
 	caps = USBDEVFS_CAP_ZERO_PACKET | USBDEVFS_CAP_NO_PACKET_SIZE_LIM |
-			USBDEVFS_CAP_REAP_AFTER_DISCONNECT;
+			USBDEVFS_CAP_REAP_AFTER_DISCONNECT | USBDEVFS_CAP_MMAP;
 	if (!ps->dev->bus->no_stop_on_short)
 		caps |= USBDEVFS_CAP_BULK_CONTINUATION;
 	if (ps->dev->bus->sg_tablesize)
@@ -2365,6 +2541,7 @@ const struct file_operations usbdev_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl =   usbdev_compat_ioctl,
 #endif
+	.mmap =           usbdev_mmap,
 	.open =		  usbdev_open,
 	.release =	  usbdev_release,
 };
diff --git a/include/uapi/linux/usbdevice_fs.h b/include/uapi/linux/usbdevice_fs.h
index 019ba1e0799a..ecbd17650e6c 100644
--- a/include/uapi/linux/usbdevice_fs.h
+++ b/include/uapi/linux/usbdevice_fs.h
@@ -134,6 +134,7 @@ struct usbdevfs_hub_portinfo {
 #define USBDEVFS_CAP_NO_PACKET_SIZE_LIM		0x04
 #define USBDEVFS_CAP_BULK_SCATTER_GATHER	0x08
 #define USBDEVFS_CAP_REAP_AFTER_DISCONNECT	0x10
+#define USBDEVFS_CAP_MMAP			0x20
 
 /* USBDEVFS_DISCONNECT_CLAIM flags & struct */
 
-- 
cgit v1.2.3


From 360104e3b8485f4923ffc202c1c2b36841de673c Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
Date: Fri, 12 Feb 2016 15:44:31 -0200
Subject: [media] media.h: get rid of MEDIA_ENT_F_CONN_TEST

Defining it as a connector was a bad idea. Remove it while it is
not too late.

Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 Documentation/DocBook/media/v4l/media-types.xml | 4 ----
 drivers/media/v4l2-core/v4l2-mc.c               | 1 -
 include/uapi/linux/media.h                      | 2 --
 3 files changed, 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/DocBook/media/v4l/media-types.xml b/Documentation/DocBook/media/v4l/media-types.xml
index 751c3d027103..8b4fa39cf611 100644
--- a/Documentation/DocBook/media/v4l/media-types.xml
+++ b/Documentation/DocBook/media/v4l/media-types.xml
@@ -56,10 +56,6 @@
 	    <entry><constant>MEDIA_ENT_F_CONN_COMPOSITE</constant></entry>
 	    <entry>Connector for a RGB composite signal.</entry>
 	  </row>
-	  <row>
-	    <entry><constant>MEDIA_ENT_F_CONN_TEST</constant></entry>
-	    <entry>Connector for a test generator.</entry>
-	  </row>
 	  <row>
 	    <entry><constant>MEDIA_ENT_F_CAM_SENSOR</constant></entry>
 	    <entry>Camera video sensor entity.</entry>
diff --git a/drivers/media/v4l2-core/v4l2-mc.c b/drivers/media/v4l2-core/v4l2-mc.c
index 9d79d599d979..a7f41b323522 100644
--- a/drivers/media/v4l2-core/v4l2-mc.c
+++ b/drivers/media/v4l2-core/v4l2-mc.c
@@ -241,7 +241,6 @@ int v4l2_mc_create_media_graph(struct media_device *mdev)
 			break;
 		case MEDIA_ENT_F_CONN_SVIDEO:
 		case MEDIA_ENT_F_CONN_COMPOSITE:
-		case MEDIA_ENT_F_CONN_TEST:
 			ret = media_create_pad_link(entity, 0, decoder,
 						    DEMOD_PAD_IF_INPUT,
 						    flags);
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index c9eb42a6c021..7113b1a8cb4e 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -78,8 +78,6 @@ struct media_device_info {
 #define MEDIA_ENT_F_CONN_RF		(MEDIA_ENT_F_BASE + 21)
 #define MEDIA_ENT_F_CONN_SVIDEO		(MEDIA_ENT_F_BASE + 22)
 #define MEDIA_ENT_F_CONN_COMPOSITE	(MEDIA_ENT_F_BASE + 23)
-/* For internal test signal generators and other debug connectors */
-#define MEDIA_ENT_F_CONN_TEST		(MEDIA_ENT_F_BASE + 24)
 
 /*
  * I/O entities
-- 
cgit v1.2.3


From 83326e43f27e9a8a501427a0060f8af519a39bb2 Mon Sep 17 00:00:00 2001
From: Andrey Smetanin <asmetanin@virtuozzo.com>
Date: Thu, 11 Feb 2016 16:45:01 +0300
Subject: kvm/x86: Hyper-V VMBus hypercall userspace exit

The patch implements KVM_EXIT_HYPERV userspace exit
functionality for Hyper-V VMBus hypercalls:
HV_X64_HCALL_POST_MESSAGE, HV_X64_HCALL_SIGNAL_EVENT.

Changes v3:
* use vcpu->arch.complete_userspace_io to setup hypercall
result

Changes v2:
* use KVM_EXIT_HYPERV for hypercalls

Signed-off-by: Andrey Smetanin <asmetanin@virtuozzo.com>
Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>
CC: Gleb Natapov <gleb@kernel.org>
CC: Paolo Bonzini <pbonzini@redhat.com>
CC: Joerg Roedel <joro@8bytes.org>
CC: "K. Y. Srinivasan" <kys@microsoft.com>
CC: Haiyang Zhang <haiyangz@microsoft.com>
CC: Roman Kagan <rkagan@virtuozzo.com>
CC: Denis V. Lunev <den@openvz.org>
CC: qemu-devel@nongnu.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/api.txt |  6 ++++++
 arch/x86/kvm/hyperv.c             | 39 ++++++++++++++++++++++++++++++++-------
 include/uapi/linux/kvm.h          |  6 ++++++
 3 files changed, 44 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 07e4cdf02407..4a661e555c09 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3339,6 +3339,7 @@ EOI was received.
 
 		struct kvm_hyperv_exit {
 #define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
 			__u32 type;
 			union {
 				struct {
@@ -3347,6 +3348,11 @@ EOI was received.
 					__u64 evt_page;
 					__u64 msg_page;
 				} synic;
+				struct {
+					__u64 input;
+					__u64 result;
+					__u64 params[2];
+				} hcall;
 			} u;
 		};
 		/* KVM_EXIT_HYPERV */
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index e8af5978762b..5ff3485acb60 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1043,6 +1043,27 @@ bool kvm_hv_hypercall_enabled(struct kvm *kvm)
 	return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
 }
 
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+	bool longmode;
+
+	longmode = is_64_bit_mode(vcpu);
+	if (longmode)
+		kvm_register_write(vcpu, VCPU_REGS_RAX, result);
+	else {
+		kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32);
+		kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff);
+	}
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+
+	kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result);
+	return 1;
+}
+
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 {
 	u64 param, ingpa, outgpa, ret;
@@ -1093,6 +1114,16 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 	case HVCALL_NOTIFY_LONG_SPIN_WAIT:
 		kvm_vcpu_on_spin(vcpu);
 		break;
+	case HVCALL_POST_MESSAGE:
+	case HVCALL_SIGNAL_EVENT:
+		vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+		vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+		vcpu->run->hyperv.u.hcall.input = param;
+		vcpu->run->hyperv.u.hcall.params[0] = ingpa;
+		vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+		vcpu->arch.complete_userspace_io =
+				kvm_hv_hypercall_complete_userspace;
+		return 0;
 	default:
 		res = HV_STATUS_INVALID_HYPERCALL_CODE;
 		break;
@@ -1100,12 +1131,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 
 set_result:
 	ret = res | (((u64)rep_done & 0xfff) << 32);
-	if (longmode) {
-		kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
-	} else {
-		kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
-		kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
-	}
-
+	kvm_hv_hypercall_set_result(vcpu, ret);
 	return 1;
 }
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a2fe0ac1d61a..82581b6e944d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -157,6 +157,7 @@ struct kvm_s390_skeys {
 
 struct kvm_hyperv_exit {
 #define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
 	__u32 type;
 	union {
 		struct {
@@ -165,6 +166,11 @@ struct kvm_hyperv_exit {
 			__u64 evt_page;
 			__u64 msg_page;
 		} synic;
+		struct {
+			__u64 input;
+			__u64 result;
+			__u64 params[2];
+		} hcall;
 	} u;
 };
 
-- 
cgit v1.2.3


From 5e2bec7c2248ae27c5b16cd97215ae05c1d39179 Mon Sep 17 00:00:00 2001
From: Aditya Kali <adityakali@google.com>
Date: Fri, 29 Jan 2016 02:54:05 -0600
Subject: sched: new clone flag CLONE_NEWCGROUP for cgroup namespace

CLONE_NEWCGROUP will be used to create new cgroup namespace.

Signed-off-by: Aditya Kali <adityakali@google.com>
Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/uapi/linux/sched.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index cc89ddefa926..5f0fe019a720 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -21,8 +21,7 @@
 #define CLONE_DETACHED		0x00400000	/* Unused, ignored */
 #define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
 #define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
-/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
-   and is now available for re-use. */
+#define CLONE_NEWCGROUP		0x02000000	/* New cgroup namespace */
 #define CLONE_NEWUTS		0x04000000	/* New utsname namespace */
 #define CLONE_NEWIPC		0x08000000	/* New ipc namespace */
 #define CLONE_NEWUSER		0x10000000	/* New user namespace */
-- 
cgit v1.2.3


From cd9b266095f422267bddbec88f9098b48ea548fc Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 11 Feb 2016 22:02:53 -0800
Subject: tcp: add tcpi_min_rtt and tcpi_notsent_bytes to tcp_info

tcpi_min_rtt reports the minimal rtt observed by TCP stack for the flow,
in usec unit. Might be ~0U if not yet known.

tcpi_notsent_bytes reports the amount of bytes in the write queue that
were not yet sent.

This is done in a single patch to not add a temporary 32bit padding hole
in tcp_info.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/tcp.h | 3 +++
 net/ipv4/tcp.c           | 6 ++++++
 2 files changed, 9 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 65a77b071e22..fe95446e9abf 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -196,6 +196,9 @@ struct tcp_info {
 	__u64	tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
 	__u32	tcpi_segs_out;	     /* RFC4898 tcpEStatsPerfSegsOut */
 	__u32	tcpi_segs_in;	     /* RFC4898 tcpEStatsPerfSegsIn */
+
+	__u32	tcpi_notsent_bytes;
+	__u32	tcpi_min_rtt;
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 014f18e2f7b3..f93150d15199 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2642,6 +2642,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp;
 	unsigned int start;
+	int notsent_bytes;
 	u64 rate64;
 	u32 rate;
 
@@ -2722,6 +2723,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
 	info->tcpi_segs_out = tp->segs_out;
 	info->tcpi_segs_in = tp->segs_in;
+
+	notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
+	info->tcpi_notsent_bytes = max(0, notsent_bytes);
+
+	info->tcpi_min_rtt = tcp_min_rtt(tp);
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
-- 
cgit v1.2.3


From d1b4c689d4130bcfd3532680b64db562300716b6 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 18 Feb 2016 15:03:24 +0100
Subject: netlink: remove mmapped netlink support

mmapped netlink has a number of unresolved issues:

- TX zerocopy support had to be disabled more than a year ago via
  commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.")
  because the content of the mmapped area can change after netlink
  attribute validation but before message processing.

- RX support was implemented mainly to speed up nfqueue dumping packet
  payload to userspace.  However, since commit ae08ce0021087a5d812d2
  ("netfilter: nfnetlink_queue: zero copy support") we avoid one copy
  with the socket-based interface too (via the skb_zerocopy helper).

The other problem is that skbs attached to mmaped netlink socket
behave different from normal skbs:

- they don't have a shinfo area, so all functions that use skb_shinfo()
(e.g. skb_clone) cannot be used.

- reserving headroom prevents userspace from seeing the content as
it expects message to start at skb->head.
See for instance
commit aa3a022094fa ("netlink: not trim skb for mmaped socket when dump").

- skbs handed e.g. to netlink_ack must have non-NULL skb->sk, else we
crash because it needs the sk to check if a tx ring is attached.

Also not obvious, leads to non-intuitive bug fixes such as 7c7bdf359
("netfilter: nfnetlink: use original skbuff when acking batches").

mmaped netlink also didn't play nicely with the skb_zerocopy helper
used by nfqueue and openvswitch.  Daniel Borkmann fixed this via
commit 6bb0fef489f6 ("netlink, mmap: fix edge-case leakages in nf queue
zero-copy")' but at the cost of also needing to provide remaining
length to the allocation function.

nfqueue also has problems when used with mmaped rx netlink:
- mmaped netlink doesn't allow use of nfqueue batch verdict messages.
  Problem is that in the mmap case, the allocation time also determines
  the ordering in which the frame will be seen by userspace (A
  allocating before B means that A is located in earlier ring slot,
  but this also means that B might get a lower sequence number then A
  since seqno is decided later.  To fix this we would need to extend the
  spinlocked region to also cover the allocation and message setup which
  isn't desirable.
- nfqueue can now be configured to queue large (GSO) skbs to userspace.
  Queing GSO packets is faster than having to force a software segmentation
  in the kernel, so this is a desirable option.  However, with a mmap based
  ring one has to use 64kb per ring slot element, else mmap has to fall back
  to the socket path (NL_MMAP_STATUS_COPY) for all large packets.

To use the mmap interface, userspace not only has to probe for mmap netlink
support, it also has to implement a recv/socket receive path in order to
handle messages that exceed the size of an rx ring element.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Ken-ichirou MATSUZAWA <chamaken@gmail.com>
Cc: Pablo Neira Ayuso <pablo@netfilter.org>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Thomas Graf <tgraf@suug.ch>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/netlink_mmap.txt | 332 -------------
 include/uapi/linux/netlink.h              |   4 +
 include/uapi/linux/netlink_diag.h         |   2 +
 net/netlink/Kconfig                       |   9 -
 net/netlink/af_netlink.c                  | 754 +-----------------------------
 net/netlink/af_netlink.h                  |  15 -
 net/netlink/diag.c                        |  39 --
 7 files changed, 15 insertions(+), 1140 deletions(-)
 delete mode 100644 Documentation/networking/netlink_mmap.txt

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/netlink_mmap.txt b/Documentation/networking/netlink_mmap.txt
deleted file mode 100644
index 54f10478e8e3..000000000000
--- a/Documentation/networking/netlink_mmap.txt
+++ /dev/null
@@ -1,332 +0,0 @@
-This file documents how to use memory mapped I/O with netlink.
-
-Author: Patrick McHardy <kaber@trash.net>
-
-Overview
---------
-
-Memory mapped netlink I/O can be used to increase throughput and decrease
-overhead of unicast receive and transmit operations. Some netlink subsystems
-require high throughput, these are mainly the netfilter subsystems
-nfnetlink_queue and nfnetlink_log, but it can also help speed up large
-dump operations of f.i. the routing database.
-
-Memory mapped netlink I/O used two circular ring buffers for RX and TX which
-are mapped into the processes address space.
-
-The RX ring is used by the kernel to directly construct netlink messages into
-user-space memory without copying them as done with regular socket I/O,
-additionally as long as the ring contains messages no recvmsg() or poll()
-syscalls have to be issued by user-space to get more message.
-
-The TX ring is used to process messages directly from user-space memory, the
-kernel processes all messages contained in the ring using a single sendmsg()
-call.
-
-Usage overview
---------------
-
-In order to use memory mapped netlink I/O, user-space needs three main changes:
-
-- ring setup
-- conversion of the RX path to get messages from the ring instead of recvmsg()
-- conversion of the TX path to construct messages into the ring
-
-Ring setup is done using setsockopt() to provide the ring parameters to the
-kernel, then a call to mmap() to map the ring into the processes address space:
-
-- setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &params, sizeof(params));
-- setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &params, sizeof(params));
-- ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)
-
-Usage of either ring is optional, but even if only the RX ring is used the
-mapping still needs to be writable in order to update the frame status after
-processing.
-
-Conversion of the reception path involves calling poll() on the file
-descriptor, once the socket is readable the frames from the ring are
-processed in order until no more messages are available, as indicated by
-a status word in the frame header.
-
-On kernel side, in order to make use of memory mapped I/O on receive, the
-originating netlink subsystem needs to support memory mapped I/O, otherwise
-it will use an allocated socket buffer as usual and the contents will be
- copied to the ring on transmission, nullifying most of the performance gains.
-Dumps of kernel databases automatically support memory mapped I/O.
-
-Conversion of the transmit path involves changing message construction to
-use memory from the TX ring instead of (usually) a buffer declared on the
-stack and setting up the frame header appropriately. Optionally poll() can
-be used to wait for free frames in the TX ring.
-
-Structured and definitions for using memory mapped I/O are contained in
-<linux/netlink.h>.
-
-RX and TX rings
-----------------
-
-Each ring contains a number of continuous memory blocks, containing frames of
-fixed size dependent on the parameters used for ring setup.
-
-Ring:	[ block 0 ]
-		[ frame 0 ]
-		[ frame 1 ]
-	[ block 1 ]
-		[ frame 2 ]
-		[ frame 3 ]
-	...
-	[ block n ]
-		[ frame 2 * n ]
-		[ frame 2 * n + 1 ]
-
-The blocks are only visible to the kernel, from the point of view of user-space
-the ring just contains the frames in a continuous memory zone.
-
-The ring parameters used for setting up the ring are defined as follows:
-
-struct nl_mmap_req {
-	unsigned int	nm_block_size;
-	unsigned int	nm_block_nr;
-	unsigned int	nm_frame_size;
-	unsigned int	nm_frame_nr;
-};
-
-Frames are grouped into blocks, where each block is a continuous region of memory
-and holds nm_block_size / nm_frame_size frames. The total number of frames in
-the ring is nm_frame_nr. The following invariants hold:
-
-- frames_per_block = nm_block_size / nm_frame_size
-
-- nm_frame_nr = frames_per_block * nm_block_nr
-
-Some parameters are constrained, specifically:
-
-- nm_block_size must be a multiple of the architectures memory page size.
-  The getpagesize() function can be used to get the page size.
-
-- nm_frame_size must be equal or larger to NL_MMAP_HDRLEN, IOW a frame must be
-  able to hold at least the frame header
-
-- nm_frame_size must be smaller or equal to nm_block_size
-
-- nm_frame_size must be a multiple of NL_MMAP_MSG_ALIGNMENT
-
-- nm_frame_nr must equal the actual number of frames as specified above.
-
-When the kernel can't allocate physically continuous memory for a ring block,
-it will fall back to use physically discontinuous memory. This might affect
-performance negatively, in order to avoid this the nm_frame_size parameter
-should be chosen to be as small as possible for the required frame size and
-the number of blocks should be increased instead.
-
-Ring frames
-------------
-
-Each frames contain a frame header, consisting of a synchronization word and some
-meta-data, and the message itself.
-
-Frame:	[ header message ]
-
-The frame header is defined as follows:
-
-struct nl_mmap_hdr {
-	unsigned int	nm_status;
-	unsigned int	nm_len;
-	__u32		nm_group;
-	/* credentials */
-	__u32		nm_pid;
-	__u32		nm_uid;
-	__u32		nm_gid;
-};
-
-- nm_status is used for synchronizing processing between the kernel and user-
-  space and specifies ownership of the frame as well as the operation to perform
-
-- nm_len contains the length of the message contained in the data area
-
-- nm_group specified the destination multicast group of message
-
-- nm_pid, nm_uid and nm_gid contain the netlink pid, UID and GID of the sending
-  process. These values correspond to the data available using SOCK_PASSCRED in
-  the SCM_CREDENTIALS cmsg.
-
-The possible values in the status word are:
-
-- NL_MMAP_STATUS_UNUSED:
-	RX ring:	frame belongs to the kernel and contains no message
-			for user-space. Approriate action is to invoke poll()
-			to wait for new messages.
-
-	TX ring:	frame belongs to user-space and can be used for
-			message construction.
-
-- NL_MMAP_STATUS_RESERVED:
-	RX ring only:	frame is currently used by the kernel for message
-			construction and contains no valid message yet.
-			Appropriate action is to invoke poll() to wait for
-			new messages.
-
-- NL_MMAP_STATUS_VALID:
-	RX ring:	frame contains a valid message. Approriate action is
-			to process the message and release the frame back to
-			the kernel by setting the status to
-			NL_MMAP_STATUS_UNUSED or queue the frame by setting the
-			status to NL_MMAP_STATUS_SKIP.
-
-	TX ring:	the frame contains a valid message from user-space to
-			be processed by the kernel. After completing processing
-			the kernel will release the frame back to user-space by
-			setting the status to NL_MMAP_STATUS_UNUSED.
-
-- NL_MMAP_STATUS_COPY:
-	RX ring only:	a message is ready to be processed but could not be
-			stored in the ring, either because it exceeded the
-			frame size or because the originating subsystem does
-			not support memory mapped I/O. Appropriate action is
-			to invoke recvmsg() to receive the message and release
-			the frame back to the kernel by setting the status to
-			NL_MMAP_STATUS_UNUSED.
-
-- NL_MMAP_STATUS_SKIP:
-	RX ring only:	user-space queued the message for later processing, but
-			processed some messages following it in the ring. The
-			kernel should skip this frame when looking for unused
-			frames.
-
-The data area of a frame begins at a offset of NL_MMAP_HDRLEN relative to the
-frame header.
-
-TX limitations
---------------
-
-As of Jan 2015 the message is always copied from the ring frame to an
-allocated buffer due to unresolved security concerns.
-See commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.").
-
-Example
--------
-
-Ring setup:
-
-	unsigned int block_size = 16 * getpagesize();
-	struct nl_mmap_req req = {
-		.nm_block_size		= block_size,
-		.nm_block_nr		= 64,
-		.nm_frame_size		= 16384,
-		.nm_frame_nr		= 64 * block_size / 16384,
-	};
-	unsigned int ring_size;
-	void *rx_ring, *tx_ring;
-
-	/* Configure ring parameters */
-	if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0)
-		exit(1);
-	if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0)
-		exit(1)
-
-	/* Calculate size of each individual ring */
-	ring_size = req.nm_block_nr * req.nm_block_size;
-
-	/* Map RX/TX rings. The TX ring is located after the RX ring */
-	rx_ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE,
-		       MAP_SHARED, fd, 0);
-	if ((long)rx_ring == -1L)
-		exit(1);
-	tx_ring = rx_ring + ring_size:
-
-Message reception:
-
-This example assumes some ring parameters of the ring setup are available.
-
-	unsigned int frame_offset = 0;
-	struct nl_mmap_hdr *hdr;
-	struct nlmsghdr *nlh;
-	unsigned char buf[16384];
-	ssize_t len;
-
-	while (1) {
-		struct pollfd pfds[1];
-
-		pfds[0].fd	= fd;
-		pfds[0].events	= POLLIN | POLLERR;
-		pfds[0].revents	= 0;
-
-		if (poll(pfds, 1, -1) < 0 && errno != -EINTR)
-			exit(1);
-
-		/* Check for errors. Error handling omitted */
-		if (pfds[0].revents & POLLERR)
-			<handle error>
-
-		/* If no new messages, poll again */
-		if (!(pfds[0].revents & POLLIN))
-			continue;
-
-		/* Process all frames */
-		while (1) {
-			/* Get next frame header */
-			hdr = rx_ring + frame_offset;
-
-			if (hdr->nm_status == NL_MMAP_STATUS_VALID) {
-				/* Regular memory mapped frame */
-				nlh = (void *)hdr + NL_MMAP_HDRLEN;
-				len = hdr->nm_len;
-
-				/* Release empty message immediately. May happen
-				 * on error during message construction.
-				 */
-				if (len == 0)
-					goto release;
-			} else if (hdr->nm_status == NL_MMAP_STATUS_COPY) {
-				/* Frame queued to socket receive queue */
-				len = recv(fd, buf, sizeof(buf), MSG_DONTWAIT);
-				if (len <= 0)
-					break;
-				nlh = buf;
-			} else
-				/* No more messages to process, continue polling */
-				break;
-
-			process_msg(nlh);
-release:
-			/* Release frame back to the kernel */
-			hdr->nm_status = NL_MMAP_STATUS_UNUSED;
-
-			/* Advance frame offset to next frame */
-			frame_offset = (frame_offset + frame_size) % ring_size;
-		}
-	}
-
-Message transmission:
-
-This example assumes some ring parameters of the ring setup are available.
-A single message is constructed and transmitted, to send multiple messages
-at once they would be constructed in consecutive frames before a final call
-to sendto().
-
-	unsigned int frame_offset = 0;
-	struct nl_mmap_hdr *hdr;
-	struct nlmsghdr *nlh;
-	struct sockaddr_nl addr = {
-		.nl_family	= AF_NETLINK,
-	};
-
-	hdr = tx_ring + frame_offset;
-	if (hdr->nm_status != NL_MMAP_STATUS_UNUSED)
-		/* No frame available. Use poll() to avoid. */
-		exit(1);
-
-	nlh = (void *)hdr + NL_MMAP_HDRLEN;
-
-	/* Build message */
-	build_message(nlh);
-
-	/* Fill frame header: length and status need to be set */
-	hdr->nm_len	= nlh->nlmsg_len;
-	hdr->nm_status	= NL_MMAP_STATUS_VALID;
-
-	if (sendto(fd, NULL, 0, 0, &addr, sizeof(addr)) < 0)
-		exit(1);
-
-	/* Advance frame offset to next frame */
-	frame_offset = (frame_offset + frame_size) % ring_size;
diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h
index f095155d8749..0dba4e4ed2be 100644
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -107,8 +107,10 @@ struct nlmsgerr {
 #define NETLINK_PKTINFO			3
 #define NETLINK_BROADCAST_ERROR		4
 #define NETLINK_NO_ENOBUFS		5
+#ifndef __KERNEL__
 #define NETLINK_RX_RING			6
 #define NETLINK_TX_RING			7
+#endif
 #define NETLINK_LISTEN_ALL_NSID		8
 #define NETLINK_LIST_MEMBERSHIPS	9
 #define NETLINK_CAP_ACK			10
@@ -134,6 +136,7 @@ struct nl_mmap_hdr {
 	__u32		nm_gid;
 };
 
+#ifndef __KERNEL__
 enum nl_mmap_status {
 	NL_MMAP_STATUS_UNUSED,
 	NL_MMAP_STATUS_RESERVED,
@@ -145,6 +148,7 @@ enum nl_mmap_status {
 #define NL_MMAP_MSG_ALIGNMENT		NLMSG_ALIGNTO
 #define NL_MMAP_MSG_ALIGN(sz)		__ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT)
 #define NL_MMAP_HDRLEN			NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr))
+#endif
 
 #define NET_MAJOR 36		/* Major 36 is reserved for networking 						*/
 
diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h
index f2159d30d1f5..d79399394b46 100644
--- a/include/uapi/linux/netlink_diag.h
+++ b/include/uapi/linux/netlink_diag.h
@@ -48,6 +48,8 @@ enum {
 
 #define NDIAG_SHOW_MEMINFO	0x00000001 /* show memory info of a socket */
 #define NDIAG_SHOW_GROUPS	0x00000002 /* show groups of a netlink socket */
+#ifndef __KERNEL__
 #define NDIAG_SHOW_RING_CFG	0x00000004 /* show ring configuration */
+#endif
 
 #endif
diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig
index 2c5e95e9bfbd..5d6e8c05b3d4 100644
--- a/net/netlink/Kconfig
+++ b/net/netlink/Kconfig
@@ -2,15 +2,6 @@
 # Netlink Sockets
 #
 
-config NETLINK_MMAP
-	bool "NETLINK: mmaped IO"
-	---help---
-	  This option enables support for memory mapped netlink IO. This
-	  reduces overhead by avoiding copying data between kernel- and
-	  userspace.
-
-	  If unsure, say N.
-
 config NETLINK_DIAG
 	tristate "NETLINK: socket monitoring interface"
 	default n
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f1ffb34e253f..85aa6ef86dfd 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -225,7 +225,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
 
 	dev_hold(dev);
 
-	if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head))
+	if (is_vmalloc_addr(skb->head))
 		nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
 	else
 		nskb = skb_clone(skb, GFP_ATOMIC);
@@ -300,610 +300,8 @@ static void netlink_rcv_wake(struct sock *sk)
 		wake_up_interruptible(&nlk->wait);
 }
 
-#ifdef CONFIG_NETLINK_MMAP
-static bool netlink_rx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->rx_ring.pg_vec != NULL;
-}
-
-static bool netlink_tx_is_mmaped(struct sock *sk)
-{
-	return nlk_sk(sk)->tx_ring.pg_vec != NULL;
-}
-
-static __pure struct page *pgvec_to_page(const void *addr)
-{
-	if (is_vmalloc_addr(addr))
-		return vmalloc_to_page(addr);
-	else
-		return virt_to_page(addr);
-}
-
-static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
-{
-	unsigned int i;
-
-	for (i = 0; i < len; i++) {
-		if (pg_vec[i] != NULL) {
-			if (is_vmalloc_addr(pg_vec[i]))
-				vfree(pg_vec[i]);
-			else
-				free_pages((unsigned long)pg_vec[i], order);
-		}
-	}
-	kfree(pg_vec);
-}
-
-static void *alloc_one_pg_vec_page(unsigned long order)
-{
-	void *buffer;
-	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
-			  __GFP_NOWARN | __GFP_NORETRY;
-
-	buffer = (void *)__get_free_pages(gfp_flags, order);
-	if (buffer != NULL)
-		return buffer;
-
-	buffer = vzalloc((1 << order) * PAGE_SIZE);
-	if (buffer != NULL)
-		return buffer;
-
-	gfp_flags &= ~__GFP_NORETRY;
-	return (void *)__get_free_pages(gfp_flags, order);
-}
-
-static void **alloc_pg_vec(struct netlink_sock *nlk,
-			   struct nl_mmap_req *req, unsigned int order)
-{
-	unsigned int block_nr = req->nm_block_nr;
-	unsigned int i;
-	void **pg_vec;
-
-	pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
-	if (pg_vec == NULL)
-		return NULL;
-
-	for (i = 0; i < block_nr; i++) {
-		pg_vec[i] = alloc_one_pg_vec_page(order);
-		if (pg_vec[i] == NULL)
-			goto err1;
-	}
-
-	return pg_vec;
-err1:
-	free_pg_vec(pg_vec, order, block_nr);
-	return NULL;
-}
-
-
-static void
-__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec,
-		   unsigned int order)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct sk_buff_head *queue;
-	struct netlink_ring *ring;
-
-	queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
-	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
-	spin_lock_bh(&queue->lock);
-
-	ring->frame_max		= req->nm_frame_nr - 1;
-	ring->head		= 0;
-	ring->frame_size	= req->nm_frame_size;
-	ring->pg_vec_pages	= req->nm_block_size / PAGE_SIZE;
-
-	swap(ring->pg_vec_len, req->nm_block_nr);
-	swap(ring->pg_vec_order, order);
-	swap(ring->pg_vec, pg_vec);
-
-	__skb_queue_purge(queue);
-	spin_unlock_bh(&queue->lock);
-
-	WARN_ON(atomic_read(&nlk->mapped));
-
-	if (pg_vec)
-		free_pg_vec(pg_vec, order, req->nm_block_nr);
-}
-
-static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
-			    bool tx_ring)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	void **pg_vec = NULL;
-	unsigned int order = 0;
-
-	ring  = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
-	if (atomic_read(&nlk->mapped))
-		return -EBUSY;
-	if (atomic_read(&ring->pending))
-		return -EBUSY;
-
-	if (req->nm_block_nr) {
-		if (ring->pg_vec != NULL)
-			return -EBUSY;
-
-		if ((int)req->nm_block_size <= 0)
-			return -EINVAL;
-		if (!PAGE_ALIGNED(req->nm_block_size))
-			return -EINVAL;
-		if (req->nm_frame_size < NL_MMAP_HDRLEN)
-			return -EINVAL;
-		if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
-			return -EINVAL;
-
-		ring->frames_per_block = req->nm_block_size /
-					 req->nm_frame_size;
-		if (ring->frames_per_block == 0)
-			return -EINVAL;
-		if (ring->frames_per_block * req->nm_block_nr !=
-		    req->nm_frame_nr)
-			return -EINVAL;
-
-		order = get_order(req->nm_block_size);
-		pg_vec = alloc_pg_vec(nlk, req, order);
-		if (pg_vec == NULL)
-			return -ENOMEM;
-	} else {
-		if (req->nm_frame_nr)
-			return -EINVAL;
-	}
-
-	mutex_lock(&nlk->pg_vec_lock);
-	if (atomic_read(&nlk->mapped) == 0) {
-		__netlink_set_ring(sk, req, tx_ring, pg_vec, order);
-		mutex_unlock(&nlk->pg_vec_lock);
-		return 0;
-	}
-
-	mutex_unlock(&nlk->pg_vec_lock);
-
-	if (pg_vec)
-		free_pg_vec(pg_vec, order, req->nm_block_nr);
-
-	return -EBUSY;
-}
-
-static void netlink_mm_open(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
-
-	if (sk)
-		atomic_inc(&nlk_sk(sk)->mapped);
-}
-
-static void netlink_mm_close(struct vm_area_struct *vma)
-{
-	struct file *file = vma->vm_file;
-	struct socket *sock = file->private_data;
-	struct sock *sk = sock->sk;
-
-	if (sk)
-		atomic_dec(&nlk_sk(sk)->mapped);
-}
-
-static const struct vm_operations_struct netlink_mmap_ops = {
-	.open	= netlink_mm_open,
-	.close	= netlink_mm_close,
-};
-
-static int netlink_mmap(struct file *file, struct socket *sock,
-			struct vm_area_struct *vma)
-{
-	struct sock *sk = sock->sk;
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	unsigned long start, size, expected;
-	unsigned int i;
-	int err = -EINVAL;
-
-	if (vma->vm_pgoff)
-		return -EINVAL;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	expected = 0;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
-		expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
-	}
-
-	if (expected == 0)
-		goto out;
-
-	size = vma->vm_end - vma->vm_start;
-	if (size != expected)
-		goto out;
-
-	start = vma->vm_start;
-	for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
-		if (ring->pg_vec == NULL)
-			continue;
-
-		for (i = 0; i < ring->pg_vec_len; i++) {
-			struct page *page;
-			void *kaddr = ring->pg_vec[i];
-			unsigned int pg_num;
-
-			for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
-				page = pgvec_to_page(kaddr);
-				err = vm_insert_page(vma, start, page);
-				if (err < 0)
-					goto out;
-				start += PAGE_SIZE;
-				kaddr += PAGE_SIZE;
-			}
-		}
-	}
-
-	atomic_inc(&nlk->mapped);
-	vma->vm_ops = &netlink_mmap_ops;
-	err = 0;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
-static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len)
-{
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
-	struct page *p_start, *p_end;
-
-	/* First page is flushed through netlink_{get,set}_status */
-	p_start = pgvec_to_page(hdr + PAGE_SIZE);
-	p_end   = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1);
-	while (p_start <= p_end) {
-		flush_dcache_page(p_start);
-		p_start++;
-	}
-#endif
-}
-
-static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
-{
-	smp_rmb();
-	flush_dcache_page(pgvec_to_page(hdr));
-	return hdr->nm_status;
-}
-
-static void netlink_set_status(struct nl_mmap_hdr *hdr,
-			       enum nl_mmap_status status)
-{
-	smp_mb();
-	hdr->nm_status = status;
-	flush_dcache_page(pgvec_to_page(hdr));
-}
-
-static struct nl_mmap_hdr *
-__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
-{
-	unsigned int pg_vec_pos, frame_off;
-
-	pg_vec_pos = pos / ring->frames_per_block;
-	frame_off  = pos % ring->frames_per_block;
-
-	return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
-}
-
-static struct nl_mmap_hdr *
-netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
-		     enum nl_mmap_status status)
-{
-	struct nl_mmap_hdr *hdr;
-
-	hdr = __netlink_lookup_frame(ring, pos);
-	if (netlink_get_status(hdr) != status)
-		return NULL;
-
-	return hdr;
-}
-
-static struct nl_mmap_hdr *
-netlink_current_frame(const struct netlink_ring *ring,
-		      enum nl_mmap_status status)
-{
-	return netlink_lookup_frame(ring, ring->head, status);
-}
-
-static void netlink_increment_head(struct netlink_ring *ring)
-{
-	ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
-}
-
-static void netlink_forward_ring(struct netlink_ring *ring)
-{
-	unsigned int head = ring->head;
-	const struct nl_mmap_hdr *hdr;
-
-	do {
-		hdr = __netlink_lookup_frame(ring, ring->head);
-		if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
-			break;
-		if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
-			break;
-		netlink_increment_head(ring);
-	} while (ring->head != head);
-}
-
-static bool netlink_has_valid_frame(struct netlink_ring *ring)
-{
-	unsigned int head = ring->head, pos = head;
-	const struct nl_mmap_hdr *hdr;
-
-	do {
-		hdr = __netlink_lookup_frame(ring, pos);
-		if (hdr->nm_status == NL_MMAP_STATUS_VALID)
-			return true;
-		pos = pos != 0 ? pos - 1 : ring->frame_max;
-	} while (pos != head);
-
-	return false;
-}
-
-static bool netlink_dump_space(struct netlink_sock *nlk)
-{
-	struct netlink_ring *ring = &nlk->rx_ring;
-	struct nl_mmap_hdr *hdr;
-	unsigned int n;
-
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL)
-		return false;
-
-	n = ring->head + ring->frame_max / 2;
-	if (n > ring->frame_max)
-		n -= ring->frame_max;
-
-	hdr = __netlink_lookup_frame(ring, n);
-
-	return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
-}
-
-static unsigned int netlink_poll(struct file *file, struct socket *sock,
-				 poll_table *wait)
-{
-	struct sock *sk = sock->sk;
-	struct netlink_sock *nlk = nlk_sk(sk);
-	unsigned int mask;
-	int err;
-
-	if (nlk->rx_ring.pg_vec != NULL) {
-		/* Memory mapped sockets don't call recvmsg(), so flow control
-		 * for dumps is performed here. A dump is allowed to continue
-		 * if at least half the ring is unused.
-		 */
-		while (nlk->cb_running && netlink_dump_space(nlk)) {
-			err = netlink_dump(sk);
-			if (err < 0) {
-				sk->sk_err = -err;
-				sk->sk_error_report(sk);
-				break;
-			}
-		}
-		netlink_rcv_wake(sk);
-	}
-
-	mask = datagram_poll(file, sock, wait);
-
-	/* We could already have received frames in the normal receive
-	 * queue, that will show up as NL_MMAP_STATUS_COPY in the ring,
-	 * so if mask contains pollin/etc already, there's no point
-	 * walking the ring.
-	 */
-	if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) {
-		spin_lock_bh(&sk->sk_receive_queue.lock);
-		if (nlk->rx_ring.pg_vec) {
-			if (netlink_has_valid_frame(&nlk->rx_ring))
-				mask |= POLLIN | POLLRDNORM;
-		}
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-	}
-
-	spin_lock_bh(&sk->sk_write_queue.lock);
-	if (nlk->tx_ring.pg_vec) {
-		if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
-			mask |= POLLOUT | POLLWRNORM;
-	}
-	spin_unlock_bh(&sk->sk_write_queue.lock);
-
-	return mask;
-}
-
-static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
-{
-	return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
-}
-
-static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
-				   struct netlink_ring *ring,
-				   struct nl_mmap_hdr *hdr)
-{
-	unsigned int size;
-	void *data;
-
-	size = ring->frame_size - NL_MMAP_HDRLEN;
-	data = (void *)hdr + NL_MMAP_HDRLEN;
-
-	skb->head	= data;
-	skb->data	= data;
-	skb_reset_tail_pointer(skb);
-	skb->end	= skb->tail + size;
-	skb->len	= 0;
-
-	skb->destructor	= netlink_skb_destructor;
-	NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
-	NETLINK_CB(skb).sk = sk;
-}
-
-static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
-				u32 dst_portid, u32 dst_group,
-				struct scm_cookie *scm)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-	struct sk_buff *skb;
-	unsigned int maxlen;
-	int err = 0, len = 0;
-
-	mutex_lock(&nlk->pg_vec_lock);
-
-	ring   = &nlk->tx_ring;
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-
-	do {
-		unsigned int nm_len;
-
-		hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
-		if (hdr == NULL) {
-			if (!(msg->msg_flags & MSG_DONTWAIT) &&
-			    atomic_read(&nlk->tx_ring.pending))
-				schedule();
-			continue;
-		}
-
-		nm_len = ACCESS_ONCE(hdr->nm_len);
-		if (nm_len > maxlen) {
-			err = -EINVAL;
-			goto out;
-		}
-
-		netlink_frame_flush_dcache(hdr, nm_len);
-
-		skb = alloc_skb(nm_len, GFP_KERNEL);
-		if (skb == NULL) {
-			err = -ENOBUFS;
-			goto out;
-		}
-		__skb_put(skb, nm_len);
-		memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len);
-		netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-
-		netlink_increment_head(ring);
-
-		NETLINK_CB(skb).portid	  = nlk->portid;
-		NETLINK_CB(skb).dst_group = dst_group;
-		NETLINK_CB(skb).creds	  = scm->creds;
-
-		err = security_netlink_send(sk, skb);
-		if (err) {
-			kfree_skb(skb);
-			goto out;
-		}
-
-		if (unlikely(dst_group)) {
-			atomic_inc(&skb->users);
-			netlink_broadcast(sk, skb, dst_portid, dst_group,
-					  GFP_KERNEL);
-		}
-		err = netlink_unicast(sk, skb, dst_portid,
-				      msg->msg_flags & MSG_DONTWAIT);
-		if (err < 0)
-			goto out;
-		len += err;
-
-	} while (hdr != NULL ||
-		 (!(msg->msg_flags & MSG_DONTWAIT) &&
-		  atomic_read(&nlk->tx_ring.pending)));
-
-	if (len > 0)
-		err = len;
-out:
-	mutex_unlock(&nlk->pg_vec_lock);
-	return err;
-}
-
-static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
-{
-	struct nl_mmap_hdr *hdr;
-
-	hdr = netlink_mmap_hdr(skb);
-	hdr->nm_len	= skb->len;
-	hdr->nm_group	= NETLINK_CB(skb).dst_group;
-	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
-	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
-	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
-	netlink_frame_flush_dcache(hdr, hdr->nm_len);
-	netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-
-	NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
-	kfree_skb(skb);
-}
-
-static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	struct netlink_ring *ring = &nlk->rx_ring;
-	struct nl_mmap_hdr *hdr;
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL) {
-		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		kfree_skb(skb);
-		netlink_overrun(sk);
-		return;
-	}
-	netlink_increment_head(ring);
-	__skb_queue_tail(&sk->sk_receive_queue, skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-
-	hdr->nm_len	= skb->len;
-	hdr->nm_group	= NETLINK_CB(skb).dst_group;
-	hdr->nm_pid	= NETLINK_CB(skb).creds.pid;
-	hdr->nm_uid	= from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
-	hdr->nm_gid	= from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
-	netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
-}
-
-#else /* CONFIG_NETLINK_MMAP */
-#define netlink_rx_is_mmaped(sk)	false
-#define netlink_tx_is_mmaped(sk)	false
-#define netlink_mmap			sock_no_mmap
-#define netlink_poll			datagram_poll
-#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm)	0
-#endif /* CONFIG_NETLINK_MMAP */
-
 static void netlink_skb_destructor(struct sk_buff *skb)
 {
-#ifdef CONFIG_NETLINK_MMAP
-	struct nl_mmap_hdr *hdr;
-	struct netlink_ring *ring;
-	struct sock *sk;
-
-	/* If a packet from the kernel to userspace was freed because of an
-	 * error without being delivered to userspace, the kernel must reset
-	 * the status. In the direction userspace to kernel, the status is
-	 * always reset here after the packet was processed and freed.
-	 */
-	if (netlink_skb_is_mmaped(skb)) {
-		hdr = netlink_mmap_hdr(skb);
-		sk = NETLINK_CB(skb).sk;
-
-		if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
-			netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-			ring = &nlk_sk(sk)->tx_ring;
-		} else {
-			if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
-				hdr->nm_len = 0;
-				netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-			}
-			ring = &nlk_sk(sk)->rx_ring;
-		}
-
-		WARN_ON(atomic_read(&ring->pending) == 0);
-		atomic_dec(&ring->pending);
-		sock_put(sk);
-
-		skb->head = NULL;
-	}
-#endif
 	if (is_vmalloc_addr(skb->head)) {
 		if (!skb->cloned ||
 		    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
@@ -937,18 +335,6 @@ static void netlink_sock_destruct(struct sock *sk)
 	}
 
 	skb_queue_purge(&sk->sk_receive_queue);
-#ifdef CONFIG_NETLINK_MMAP
-	if (1) {
-		struct nl_mmap_req req;
-
-		memset(&req, 0, sizeof(req));
-		if (nlk->rx_ring.pg_vec)
-			__netlink_set_ring(sk, &req, false, NULL, 0);
-		memset(&req, 0, sizeof(req));
-		if (nlk->tx_ring.pg_vec)
-			__netlink_set_ring(sk, &req, true, NULL, 0);
-	}
-#endif /* CONFIG_NETLINK_MMAP */
 
 	if (!sock_flag(sk, SOCK_DEAD)) {
 		printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
@@ -1194,9 +580,6 @@ static int __netlink_create(struct net *net, struct socket *sock,
 		mutex_init(nlk->cb_mutex);
 	}
 	init_waitqueue_head(&nlk->wait);
-#ifdef CONFIG_NETLINK_MMAP
-	mutex_init(&nlk->pg_vec_lock);
-#endif
 
 	sk->sk_destruct = netlink_sock_destruct;
 	sk->sk_protocol = protocol;
@@ -1728,8 +1111,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
 	nlk = nlk_sk(sk);
 
 	if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-	     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
-	    !netlink_skb_is_mmaped(skb)) {
+	     test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
 		DECLARE_WAITQUEUE(wait, current);
 		if (!*timeo) {
 			if (!ssk || netlink_is_kernel(ssk))
@@ -1767,14 +1149,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
 
 	netlink_deliver_tap(skb);
 
-#ifdef CONFIG_NETLINK_MMAP
-	if (netlink_skb_is_mmaped(skb))
-		netlink_queue_mmaped_skb(sk, skb);
-	else if (netlink_rx_is_mmaped(sk))
-		netlink_ring_set_copied(sk, skb);
-	else
-#endif /* CONFIG_NETLINK_MMAP */
-		skb_queue_tail(&sk->sk_receive_queue, skb);
+	skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk);
 	return len;
 }
@@ -1798,9 +1173,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
 	int delta;
 
 	WARN_ON(skb->sk != NULL);
-	if (netlink_skb_is_mmaped(skb))
-		return skb;
-
 	delta = skb->end - skb->tail;
 	if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
 		return skb;
@@ -1880,71 +1252,6 @@ struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
 				    unsigned int ldiff, u32 dst_portid,
 				    gfp_t gfp_mask)
 {
-#ifdef CONFIG_NETLINK_MMAP
-	unsigned int maxlen, linear_size;
-	struct sock *sk = NULL;
-	struct sk_buff *skb;
-	struct netlink_ring *ring;
-	struct nl_mmap_hdr *hdr;
-
-	sk = netlink_getsockbyportid(ssk, dst_portid);
-	if (IS_ERR(sk))
-		goto out;
-
-	ring = &nlk_sk(sk)->rx_ring;
-	/* fast-path without atomic ops for common case: non-mmaped receiver */
-	if (ring->pg_vec == NULL)
-		goto out_put;
-
-	/* We need to account the full linear size needed as a ring
-	 * slot cannot have non-linear parts.
-	 */
-	linear_size = size + ldiff;
-	if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
-		goto out_put;
-
-	skb = alloc_skb_head(gfp_mask);
-	if (skb == NULL)
-		goto err1;
-
-	spin_lock_bh(&sk->sk_receive_queue.lock);
-	/* check again under lock */
-	if (ring->pg_vec == NULL)
-		goto out_free;
-
-	/* check again under lock */
-	maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-	if (maxlen < linear_size)
-		goto out_free;
-
-	netlink_forward_ring(ring);
-	hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
-	if (hdr == NULL)
-		goto err2;
-
-	netlink_ring_setup_skb(skb, sk, ring, hdr);
-	netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
-	atomic_inc(&ring->pending);
-	netlink_increment_head(ring);
-
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-	return skb;
-
-err2:
-	kfree_skb(skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-	netlink_overrun(sk);
-err1:
-	sock_put(sk);
-	return NULL;
-
-out_free:
-	kfree_skb(skb);
-	spin_unlock_bh(&sk->sk_receive_queue.lock);
-out_put:
-	sock_put(sk);
-out:
-#endif
 	return alloc_skb(size, gfp_mask);
 }
 EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
@@ -2225,8 +1532,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 	if (level != SOL_NETLINK)
 		return -ENOPROTOOPT;
 
-	if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
-	    optlen >= sizeof(int) &&
+	if (optlen >= sizeof(int) &&
 	    get_user(val, (unsigned int __user *)optval))
 		return -EFAULT;
 
@@ -2279,25 +1585,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		}
 		err = 0;
 		break;
-#ifdef CONFIG_NETLINK_MMAP
-	case NETLINK_RX_RING:
-	case NETLINK_TX_RING: {
-		struct nl_mmap_req req;
-
-		/* Rings might consume more memory than queue limits, require
-		 * CAP_NET_ADMIN.
-		 */
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-		if (optlen < sizeof(req))
-			return -EINVAL;
-		if (copy_from_user(&req, optval, sizeof(req)))
-			return -EFAULT;
-		err = netlink_set_ring(sk, &req,
-				       optname == NETLINK_TX_RING);
-		break;
-	}
-#endif /* CONFIG_NETLINK_MMAP */
 	case NETLINK_LISTEN_ALL_NSID:
 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
 			return -EPERM;
@@ -2467,18 +1754,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
 		smp_rmb();
 	}
 
-	/* It's a really convoluted way for userland to ask for mmaped
-	 * sendmsg(), but that's what we've got...
-	 */
-	if (netlink_tx_is_mmaped(sk) &&
-	    iter_is_iovec(&msg->msg_iter) &&
-	    msg->msg_iter.nr_segs == 1 &&
-	    msg->msg_iter.iov->iov_base == NULL) {
-		err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
-					   &scm);
-		goto out;
-	}
-
 	err = -EMSGSIZE;
 	if (len > sk->sk_sndbuf - 32)
 		goto out;
@@ -2794,8 +2069,7 @@ static int netlink_dump(struct sock *sk)
 		goto errout_skb;
 	}
 
-	if (!netlink_rx_is_mmaped(sk) &&
-	    atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		goto errout_skb;
 
 	/* NLMSG_GOODSIZE is small to avoid high order allocations being
@@ -2831,8 +2105,7 @@ static int netlink_dump(struct sock *sk)
 	 * reasonable static buffer based on the expected largest dump of a
 	 * single netdev. The outcome is MSG_TRUNC error.
 	 */
-	if (!netlink_rx_is_mmaped(sk))
-		skb_reserve(skb, skb_tailroom(skb) - alloc_size);
+	skb_reserve(skb, skb_tailroom(skb) - alloc_size);
 	netlink_skb_set_owner_r(skb, sk);
 
 	len = cb->dump(skb, cb);
@@ -2884,16 +2157,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
 	struct netlink_sock *nlk;
 	int ret;
 
-	/* Memory mapped dump requests need to be copied to avoid looping
-	 * on the pending state in netlink_mmap_sendmsg() while the CB hold
-	 * a reference to the skb.
-	 */
-	if (netlink_skb_is_mmaped(skb)) {
-		skb = skb_copy(skb, GFP_KERNEL);
-		if (skb == NULL)
-			return -ENOBUFS;
-	} else
-		atomic_inc(&skb->users);
+	atomic_inc(&skb->users);
 
 	sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
 	if (sk == NULL) {
@@ -3241,7 +2505,7 @@ static const struct proto_ops netlink_ops = {
 	.socketpair =	sock_no_socketpair,
 	.accept =	sock_no_accept,
 	.getname =	netlink_getname,
-	.poll =		netlink_poll,
+	.poll =		datagram_poll,
 	.ioctl =	sock_no_ioctl,
 	.listen =	sock_no_listen,
 	.shutdown =	sock_no_shutdown,
@@ -3249,7 +2513,7 @@ static const struct proto_ops netlink_ops = {
 	.getsockopt =	netlink_getsockopt,
 	.sendmsg =	netlink_sendmsg,
 	.recvmsg =	netlink_recvmsg,
-	.mmap =		netlink_mmap,
+	.mmap =		sock_no_mmap,
 	.sendpage =	sock_no_sendpage,
 };
 
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 14437d9b1965..e68ef9ccd703 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -44,12 +44,6 @@ struct netlink_sock {
 	int			(*netlink_bind)(struct net *net, int group);
 	void			(*netlink_unbind)(struct net *net, int group);
 	struct module		*module;
-#ifdef CONFIG_NETLINK_MMAP
-	struct mutex		pg_vec_lock;
-	struct netlink_ring	rx_ring;
-	struct netlink_ring	tx_ring;
-	atomic_t		mapped;
-#endif /* CONFIG_NETLINK_MMAP */
 
 	struct rhash_head	node;
 	struct rcu_head		rcu;
@@ -60,15 +54,6 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk)
 	return container_of(sk, struct netlink_sock, sk);
 }
 
-static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb)
-{
-#ifdef CONFIG_NETLINK_MMAP
-	return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
-#else
-	return false;
-#endif /* CONFIG_NETLINK_MMAP */
-}
-
 struct netlink_table {
 	struct rhashtable	hash;
 	struct hlist_head	mc_list;
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 3ee63a3cff30..8dd836a8dd60 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -8,41 +8,6 @@
 
 #include "af_netlink.h"
 
-#ifdef CONFIG_NETLINK_MMAP
-static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type,
-			    struct sk_buff *nlskb)
-{
-	struct netlink_diag_ring ndr;
-
-	ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
-	ndr.ndr_block_nr   = ring->pg_vec_len;
-	ndr.ndr_frame_size = ring->frame_size;
-	ndr.ndr_frame_nr   = ring->frame_max + 1;
-
-	return nla_put(nlskb, nl_type, sizeof(ndr), &ndr);
-}
-
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
-	struct netlink_sock *nlk = nlk_sk(sk);
-	int ret;
-
-	mutex_lock(&nlk->pg_vec_lock);
-	ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb);
-	if (!ret)
-		ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING,
-				       nlskb);
-	mutex_unlock(&nlk->pg_vec_lock);
-
-	return ret;
-}
-#else
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
-	return 0;
-}
-#endif
-
 static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
 {
 	struct netlink_sock *nlk = nlk_sk(sk);
@@ -87,10 +52,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 	    sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
 		goto out_nlmsg_trim;
 
-	if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) &&
-	    sk_diag_put_rings_cfg(sk, skb))
-		goto out_nlmsg_trim;
-
 	nlmsg_end(skb, nlh);
 	return 0;
 
-- 
cgit v1.2.3


From df4878e969ccc047da45d2cd3af5d08031da1593 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 12 Feb 2016 14:48:23 +0100
Subject: gpio: store reflect the label to userspace

The gpio_chip label is useful for userspace to understand what
kind of GPIO chip it is dealing with. Let's store a copy of this
label in the gpio_device, add it to the struct passed to userspace
for GPIO_GET_CHIPINFO_IOCTL and modify lsgpio to show it.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c      | 13 +++++++++++++
 drivers/gpio/gpiolib.h      |  3 +++
 include/linux/gpio/driver.h |  3 ++-
 include/uapi/linux/gpio.h   |  2 ++
 tools/gpio/lsgpio.c         |  4 ++--
 5 files changed, 22 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 59f0045c5950..797c790aa750 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -342,6 +342,9 @@ static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		strncpy(chipinfo.name, dev_name(&gdev->dev),
 			sizeof(chipinfo.name));
 		chipinfo.name[sizeof(chipinfo.name)-1] = '\0';
+		strncpy(chipinfo.label, gdev->label,
+			sizeof(chipinfo.label));
+		chipinfo.label[sizeof(chipinfo.label)-1] = '\0';
 		chipinfo.lines = gdev->ngpio;
 		if (copy_to_user(ip, &chipinfo, sizeof(chipinfo)))
 			return -EFAULT;
@@ -479,6 +482,16 @@ int gpiochip_add_data(struct gpio_chip *chip, void *data)
 		status = -EINVAL;
 		goto err_free_gdev;
 	}
+
+	if (chip->label)
+		gdev->label = devm_kstrdup(&gdev->dev, chip->label, GFP_KERNEL);
+	else
+		gdev->label = devm_kstrdup(&gdev->dev, "unknown", GFP_KERNEL);
+	if (!gdev->label) {
+		status = -ENOMEM;
+		goto err_free_gdev;
+	}
+
 	gdev->ngpio = chip->ngpio;
 	gdev->data = data;
 
diff --git a/drivers/gpio/gpiolib.h b/drivers/gpio/gpiolib.h
index ddbe409ad48f..e30e5fdb1214 100644
--- a/drivers/gpio/gpiolib.h
+++ b/drivers/gpio/gpiolib.h
@@ -37,6 +37,8 @@ struct acpi_device;
  * of the @descs array.
  * @base: GPIO base in the DEPRECATED global Linux GPIO numberspace, assigned
  * at device creation time.
+ * @label: a descriptive name for the GPIO device, such as the part number
+ * or name of the IP component in a System on Chip.
  * @data: per-instance data assigned by the driver
  * @list: links gpio_device:s together for traversal
  *
@@ -55,6 +57,7 @@ struct gpio_device {
 	struct gpio_desc	*descs;
 	int			base;
 	u16			ngpio;
+	char			*label;
 	void			*data;
 	struct list_head        list;
 
diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h
index ff96d0f9fceb..639607658ed8 100644
--- a/include/linux/gpio/driver.h
+++ b/include/linux/gpio/driver.h
@@ -21,7 +21,8 @@ struct gpio_device;
 
 /**
  * struct gpio_chip - abstract a GPIO controller
- * @label: for diagnostics
+ * @label: a functional name for the GPIO device, such as a part
+ *	number or the name of the SoC IP-block implementing it.
  * @gpiodev: the internal state holder, opaque struct
  * @parent: optional parent device providing the GPIOs
  * @owner: helps prevent removal of modules exporting active GPIOs
diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h
index 3188a87bdaa0..3f93e1bcd3dd 100644
--- a/include/uapi/linux/gpio.h
+++ b/include/uapi/linux/gpio.h
@@ -16,10 +16,12 @@
 /**
  * struct gpiochip_info - Information about a certain GPIO chip
  * @name: the name of this GPIO chip
+ * @label: a functional name for this GPIO chip
  * @lines: number of GPIO lines on this chip
  */
 struct gpiochip_info {
 	char name[32];
+	char label[32];
 	__u32 lines;
 };
 
diff --git a/tools/gpio/lsgpio.c b/tools/gpio/lsgpio.c
index 4cfe29da279b..692233f561fb 100644
--- a/tools/gpio/lsgpio.c
+++ b/tools/gpio/lsgpio.c
@@ -54,8 +54,8 @@ int list_device(const char *device_name)
 
 		goto free_chrdev_name;
 	}
-	fprintf(stdout, "GPIO chip: %s, %u GPIO lines\n",
-		cinfo.name, cinfo.lines);
+	fprintf(stdout, "GPIO chip: %s, \"%s\", %u GPIO lines\n",
+		cinfo.name, cinfo.label, cinfo.lines);
 
 	if (close(fd) == -1)  {
 		ret = -errno;
-- 
cgit v1.2.3


From 521a2ad6f862a28e2e43cb3e254a26bf0f9452e9 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Fri, 12 Feb 2016 22:25:22 +0100
Subject: gpio: add userspace ABI for GPIO line information

This adds a GPIO line ABI for getting name, label and a few select
flags from the kernel.

This hides the kernel internals and only tells userspace what it
may need to know: the different in-kernel consumers are masked
behind the flag "kernel" and that is all userspace needs to know.

However electric characteristics like active low, open drain etc
are reflected to userspace, as this is important information.

We provide information on all lines on all chips, later on we will
likely add a flag for the chardev consumer so we can filter and
display only the lines userspace actually uses in e.g. lsgpio,
but then we first need an ABI for userspace to grab and use
(get/set/select direction) a GPIO line.

Sample output from "lsgpio" on ux500:

GPIO chip: gpiochip7, "8011e000.gpio", 32 GPIO lines
        line 0: unnamed unlabeled
        line 1: unnamed unlabeled
(...)
        line 25: unnamed "SFH7741 Proximity Sensor" [kernel output open-drain]
        line 26: unnamed unlabeled
(...)

Tested-by: Michael Welling <mwelling@ieee.org>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c    | 51 ++++++++++++++++++++++++--
 include/uapi/linux/gpio.h | 26 ++++++++++++++
 tools/gpio/gpio-utils.h   |  2 ++
 tools/gpio/lsgpio.c       | 91 ++++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 156 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 797c790aa750..3580c0de9d5a 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -331,14 +331,15 @@ static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 	struct gpio_device *gdev = filp->private_data;
 	struct gpio_chip *chip = gdev->chip;
 	int __user *ip = (int __user *)arg;
-	struct gpiochip_info chipinfo;
 
 	/* We fail any subsequent ioctl():s when the chip is gone */
 	if (!chip)
 		return -ENODEV;
 
+	/* Fill in the struct and pass to userspace */
 	if (cmd == GPIO_GET_CHIPINFO_IOCTL) {
-		/* Fill in the struct and pass to userspace */
+		struct gpiochip_info chipinfo;
+
 		strncpy(chipinfo.name, dev_name(&gdev->dev),
 			sizeof(chipinfo.name));
 		chipinfo.name[sizeof(chipinfo.name)-1] = '\0';
@@ -349,6 +350,52 @@ static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		if (copy_to_user(ip, &chipinfo, sizeof(chipinfo)))
 			return -EFAULT;
 		return 0;
+	} else if (cmd == GPIO_GET_LINEINFO_IOCTL) {
+		struct gpioline_info lineinfo;
+		struct gpio_desc *desc;
+
+		if (copy_from_user(&lineinfo, ip, sizeof(lineinfo)))
+			return -EFAULT;
+		if (lineinfo.line_offset > gdev->ngpio)
+			return -EINVAL;
+
+		desc = &gdev->descs[lineinfo.line_offset];
+		if (desc->name) {
+			strncpy(lineinfo.name, desc->name,
+				sizeof(lineinfo.name));
+			lineinfo.name[sizeof(lineinfo.name)-1] = '\0';
+		} else {
+			lineinfo.name[0] = '\0';
+		}
+		if (desc->label) {
+			strncpy(lineinfo.label, desc->label,
+				sizeof(lineinfo.label));
+			lineinfo.label[sizeof(lineinfo.label)-1] = '\0';
+		} else {
+			lineinfo.label[0] = '\0';
+		}
+
+		/*
+		 * Userspace only need to know that the kernel is using
+		 * this GPIO so it can't use it.
+		 */
+		lineinfo.flags = 0;
+		if (desc->flags & (FLAG_REQUESTED | FLAG_IS_HOGGED |
+				   FLAG_USED_AS_IRQ | FLAG_EXPORT |
+				   FLAG_SYSFS))
+			lineinfo.flags |= GPIOLINE_FLAG_KERNEL;
+		if (desc->flags & FLAG_IS_OUT)
+			lineinfo.flags |= GPIOLINE_FLAG_IS_OUT;
+		if (desc->flags & FLAG_ACTIVE_LOW)
+			lineinfo.flags |= GPIOLINE_FLAG_ACTIVE_LOW;
+		if (desc->flags & FLAG_OPEN_DRAIN)
+			lineinfo.flags |= GPIOLINE_FLAG_OPEN_DRAIN;
+		if (desc->flags & FLAG_OPEN_SOURCE)
+			lineinfo.flags |= GPIOLINE_FLAG_OPEN_SOURCE;
+
+		if (copy_to_user(ip, &lineinfo, sizeof(lineinfo)))
+			return -EFAULT;
+		return 0;
 	}
 	return -EINVAL;
 }
diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h
index 3f93e1bcd3dd..416ce47f2291 100644
--- a/include/uapi/linux/gpio.h
+++ b/include/uapi/linux/gpio.h
@@ -25,6 +25,32 @@ struct gpiochip_info {
 	__u32 lines;
 };
 
+/* Line is in use by the kernel */
+#define GPIOLINE_FLAG_KERNEL		(1UL << 0)
+#define GPIOLINE_FLAG_IS_OUT		(1UL << 1)
+#define GPIOLINE_FLAG_ACTIVE_LOW	(1UL << 2)
+#define GPIOLINE_FLAG_OPEN_DRAIN	(1UL << 3)
+#define GPIOLINE_FLAG_OPEN_SOURCE	(1UL << 4)
+
+/**
+ * struct gpioline_info - Information about a certain GPIO line
+ * @line_offset: the local offset on this GPIO device, fill in when
+ * requesting information from the kernel
+ * @flags: various flags for this line
+ * @name: the name of this GPIO line
+ * @label: a functional name for this GPIO line
+ * @kernel: this GPIO is in use by the kernel
+ * @out: this GPIO is an output line (false means it is an input)
+ * @active_low: this GPIO is active low
+ */
+struct gpioline_info {
+	__u32 line_offset;
+	__u32 flags;
+	char name[32];
+	char label[32];
+};
+
 #define GPIO_GET_CHIPINFO_IOCTL _IOR('o', 0x01, struct gpiochip_info)
+#define GPIO_GET_LINEINFO_IOCTL _IOWR('o', 0x02, struct gpioline_info)
 
 #endif /* _UAPI_GPIO_H_ */
diff --git a/tools/gpio/gpio-utils.h b/tools/gpio/gpio-utils.h
index b18209a45ad3..5f57133b8c04 100644
--- a/tools/gpio/gpio-utils.h
+++ b/tools/gpio/gpio-utils.h
@@ -16,6 +16,8 @@
 
 #include <string.h>
 
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
 static inline int check_prefix(const char *str, const char *prefix)
 {
 	return strlen(str) > strlen(prefix) &&
diff --git a/tools/gpio/lsgpio.c b/tools/gpio/lsgpio.c
index 692233f561fb..5535ce81f8f7 100644
--- a/tools/gpio/lsgpio.c
+++ b/tools/gpio/lsgpio.c
@@ -26,12 +26,56 @@
 
 #include "gpio-utils.h"
 
+struct gpio_flag {
+	char *name;
+	unsigned long mask;
+};
+
+struct gpio_flag flagnames[] = {
+	{
+		.name = "kernel",
+		.mask = GPIOLINE_FLAG_KERNEL,
+	},
+	{
+		.name = "output",
+		.mask = GPIOLINE_FLAG_IS_OUT,
+	},
+	{
+		.name = "active-low",
+		.mask = GPIOLINE_FLAG_ACTIVE_LOW,
+	},
+	{
+		.name = "open-drain",
+		.mask = GPIOLINE_FLAG_OPEN_DRAIN,
+	},
+	{
+		.name = "open-source",
+		.mask = GPIOLINE_FLAG_OPEN_SOURCE,
+	},
+};
+
+void print_flags(unsigned long flags)
+{
+	int i;
+	int printed = 0;
+
+	for (i = 0; i < ARRAY_SIZE(flagnames); i++) {
+		if (flags & flagnames[i].mask) {
+			if (printed)
+				fprintf(stdout, " ");
+			fprintf(stdout, "%s", flagnames[i].name);
+			printed++;
+		}
+	}
+}
+
 int list_device(const char *device_name)
 {
 	struct gpiochip_info cinfo;
 	char *chrdev_name;
 	int fd;
 	int ret;
+	int i;
 
 	ret = asprintf(&chrdev_name, "/dev/%s", device_name);
 	if (ret < 0)
@@ -41,32 +85,55 @@ int list_device(const char *device_name)
 	if (fd == -1) {
 		ret = -errno;
 		fprintf(stderr, "Failed to open %s\n", chrdev_name);
-		goto free_chrdev_name;
+		goto exit_close_error;
 	}
 
 	/* Inspect this GPIO chip */
 	ret = ioctl(fd, GPIO_GET_CHIPINFO_IOCTL, &cinfo);
 	if (ret == -1) {
 		ret = -errno;
-		fprintf(stderr, "Failed to retrieve GPIO fd\n");
-		if (close(fd) == -1)
-			perror("Failed to close GPIO character device file");
-
-		goto free_chrdev_name;
+		perror("Failed to issue CHIPINFO IOCTL\n");
+		goto exit_close_error;
 	}
 	fprintf(stdout, "GPIO chip: %s, \"%s\", %u GPIO lines\n",
 		cinfo.name, cinfo.label, cinfo.lines);
 
-	if (close(fd) == -1)  {
-		ret = -errno;
-		goto free_chrdev_name;
+	/* Loop over the lines and print info */
+	for (i = 0; i < cinfo.lines; i++) {
+		struct gpioline_info linfo;
+
+		memset(&linfo, 0, sizeof(linfo));
+		linfo.line_offset = i;
+
+		ret = ioctl(fd, GPIO_GET_LINEINFO_IOCTL, &linfo);
+		if (ret == -1) {
+			ret = -errno;
+			perror("Failed to issue LINEINFO IOCTL\n");
+			goto exit_close_error;
+		}
+		fprintf(stdout, "\tline %d:", linfo.line_offset);
+		if (linfo.name[0])
+			fprintf(stdout, " %s", linfo.name);
+		else
+			fprintf(stdout, " unnamed");
+		if (linfo.label[0])
+			fprintf(stdout, " \"%s\"", linfo.label);
+		else
+			fprintf(stdout, " unlabeled");
+		if (linfo.flags) {
+			fprintf(stdout, " [");
+			print_flags(linfo.flags);
+			fprintf(stdout, "]");
+		}
+		fprintf(stdout, "\n");
+
 	}
 
-free_chrdev_name:
+exit_close_error:
+	if (close(fd) == -1)
+		perror("Failed to close GPIO character device file");
 	free(chrdev_name);
-
 	return ret;
-
 }
 
 void print_usage(void)
-- 
cgit v1.2.3


From cedc12108b55d0c571da883a4083977af612dcce Mon Sep 17 00:00:00 2001
From: Wu-Cheng Li <wuchengli@chromium.org>
Date: Tue, 19 Jan 2016 05:07:09 -0200
Subject: [media] v4l: add V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME

Some drivers also need a control like
V4L2_CID_MPEG_MFC51_VIDEO_FORCE_FRAME_TYPE to force an encoder
key frame. Add a general V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME
so the new drivers and applications can use it.

Signed-off-by: Wu-Cheng Li <wuchengli@chromium.org>
Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 Documentation/DocBook/media/v4l/controls.xml | 8 ++++++++
 drivers/media/v4l2-core/v4l2-ctrls.c         | 2 ++
 include/uapi/linux/v4l2-controls.h           | 1 +
 3 files changed, 11 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/Documentation/DocBook/media/v4l/controls.xml b/Documentation/DocBook/media/v4l/controls.xml
index 7a771492f2a5..361040e6b0f4 100644
--- a/Documentation/DocBook/media/v4l/controls.xml
+++ b/Documentation/DocBook/media/v4l/controls.xml
@@ -2329,6 +2329,14 @@ to search and match for the present Macroblock (MB) in the reference picture. Th
 vertical search range for motion estimation module in video encoder.</entry>
 	      </row>
 
+	      <row><entry></entry></row>
+	      <row id="v4l2-mpeg-video-force-key-frame">
+		<entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME</constant>&nbsp;</entry>
+		<entry>button</entry>
+	      </row><row><entry spanname="descr">Force a key frame for the next queued buffer. Applicable to encoders.
+This is a general, codec-agnostic keyframe control.</entry>
+	      </row>
+
 	      <row><entry></entry></row>
 	      <row>
 		<entry spanname="id"><constant>V4L2_CID_MPEG_VIDEO_H264_CPB_SIZE</constant>&nbsp;</entry>
diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c
index 130e5badd599..8b321e0aae62 100644
--- a/drivers/media/v4l2-core/v4l2-ctrls.c
+++ b/drivers/media/v4l2-core/v4l2-ctrls.c
@@ -758,6 +758,7 @@ const char *v4l2_ctrl_get_name(u32 id)
 	case V4L2_CID_MPEG_VIDEO_MV_H_SEARCH_RANGE:		return "Horizontal MV Search Range";
 	case V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE:		return "Vertical MV Search Range";
 	case V4L2_CID_MPEG_VIDEO_REPEAT_SEQ_HEADER:		return "Repeat Sequence Header";
+	case V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME:		return "Force Key Frame";
 
 	/* VPX controls */
 	case V4L2_CID_MPEG_VIDEO_VPX_NUM_PARTITIONS:		return "VPX Number of Partitions";
@@ -998,6 +999,7 @@ void v4l2_ctrl_fill(u32 id, const char **name, enum v4l2_ctrl_type *type,
 	case V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE:
 		*type = V4L2_CTRL_TYPE_INTEGER;
 		break;
+	case V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME:
 	case V4L2_CID_PAN_RESET:
 	case V4L2_CID_TILT_RESET:
 	case V4L2_CID_FLASH_STROBE:
diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h
index 2ae5c3ea179b..b6a357a5f053 100644
--- a/include/uapi/linux/v4l2-controls.h
+++ b/include/uapi/linux/v4l2-controls.h
@@ -390,6 +390,7 @@ enum v4l2_mpeg_video_multi_slice_mode {
 #define V4L2_CID_MPEG_VIDEO_REPEAT_SEQ_HEADER		(V4L2_CID_MPEG_BASE+226)
 #define V4L2_CID_MPEG_VIDEO_MV_H_SEARCH_RANGE		(V4L2_CID_MPEG_BASE+227)
 #define V4L2_CID_MPEG_VIDEO_MV_V_SEARCH_RANGE		(V4L2_CID_MPEG_BASE+228)
+#define V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME		(V4L2_CID_MPEG_BASE+229)
 
 #define V4L2_CID_MPEG_VIDEO_H263_I_FRAME_QP		(V4L2_CID_MPEG_BASE+300)
 #define V4L2_CID_MPEG_VIDEO_H263_P_FRAME_QP		(V4L2_CID_MPEG_BASE+301)
-- 
cgit v1.2.3


From d65fae92f9a2862d73293cab6c427c40cff71f70 Mon Sep 17 00:00:00 2001
From: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Date: Wed, 11 Nov 2015 22:02:00 -0200
Subject: [media] v4l: Add YUV 4:2:2 and YUV 4:4:4 tri-planar non-contiguous
 formats

The formats use three planes through the multiplanar API, allowing for
non-contiguous planes in memory.

Signed-off-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Acked-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 Documentation/DocBook/media/v4l/pixfmt-yuv422m.xml | 166 +++++++++++++++++++
 Documentation/DocBook/media/v4l/pixfmt-yuv444m.xml | 177 +++++++++++++++++++++
 Documentation/DocBook/media/v4l/pixfmt.xml         |   2 +
 drivers/media/v4l2-core/v4l2-ioctl.c               |   4 +
 include/uapi/linux/videodev2.h                     |   4 +
 5 files changed, 353 insertions(+)
 create mode 100644 Documentation/DocBook/media/v4l/pixfmt-yuv422m.xml
 create mode 100644 Documentation/DocBook/media/v4l/pixfmt-yuv444m.xml

(limited to 'include/uapi/linux')

diff --git a/Documentation/DocBook/media/v4l/pixfmt-yuv422m.xml b/Documentation/DocBook/media/v4l/pixfmt-yuv422m.xml
new file mode 100644
index 000000000000..dd502802cb75
--- /dev/null
+++ b/Documentation/DocBook/media/v4l/pixfmt-yuv422m.xml
@@ -0,0 +1,166 @@
+    <refentry>
+      <refmeta>
+	<refentrytitle>V4L2_PIX_FMT_YUV422M ('YM16'), V4L2_PIX_FMT_YVU422M ('YM61')</refentrytitle>
+	&manvol;
+      </refmeta>
+      <refnamediv>
+	<refname id="V4L2-PIX-FMT-YUV422M"><constant>V4L2_PIX_FMT_YUV422M</constant></refname>
+	<refname id="V4L2-PIX-FMT-YVU422M"><constant>V4L2_PIX_FMT_YVU422M</constant></refname>
+	<refpurpose>Planar formats with &frac12; horizontal resolution, also
+	known as YUV and YVU 4:2:2</refpurpose>
+      </refnamediv>
+
+      <refsect1>
+	<title>Description</title>
+
+	<para>This is a multi-planar format, as opposed to a packed format.
+The three components are separated into three sub-images or planes.</para>
+
+	<para>The Y plane is first. The Y plane has one byte per pixel.
+For <constant>V4L2_PIX_FMT_YUV422M</constant> the Cb data
+constitutes the second plane which is half the width of the Y plane (and of the
+image). Each Cb belongs to two pixels. For example,
+Cb<subscript>0</subscript> belongs to Y'<subscript>00</subscript>,
+Y'<subscript>01</subscript>. The Cr data, just like the Cb plane, is
+in the third plane. </para>
+
+	<para><constant>V4L2_PIX_FMT_YVU422M</constant> is the same except
+the Cr data is stored in the second plane and the Cb data in the third plane.
+</para>
+
+	<para>If the Y plane has pad bytes after each row, then the Cb
+and Cr planes have half as many pad bytes after their rows. In other
+words, two Cx rows (including padding) is exactly as long as one Y row
+(including padding).</para>
+
+	<para><constant>V4L2_PIX_FMT_YUV422M</constant> and
+<constant>V4L2_PIX_FMT_YVU422M</constant> are intended to be
+used only in drivers and applications that support the multi-planar API,
+described in <xref linkend="planar-apis"/>. </para>
+
+	<example>
+	  <title><constant>V4L2_PIX_FMT_YUV422M</constant> 4 &times; 4
+pixel image</title>
+
+	  <formalpara>
+	    <title>Byte Order.</title>
+	    <para>Each cell is one byte.
+		<informaltable frame="none">
+		<tgroup cols="5" align="center">
+		  <colspec align="left" colwidth="2*" />
+		  <tbody valign="top">
+		    <row>
+		      <entry>start0&nbsp;+&nbsp;0:</entry>
+		      <entry>Y'<subscript>00</subscript></entry>
+		      <entry>Y'<subscript>01</subscript></entry>
+		      <entry>Y'<subscript>02</subscript></entry>
+		      <entry>Y'<subscript>03</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start0&nbsp;+&nbsp;4:</entry>
+		      <entry>Y'<subscript>10</subscript></entry>
+		      <entry>Y'<subscript>11</subscript></entry>
+		      <entry>Y'<subscript>12</subscript></entry>
+		      <entry>Y'<subscript>13</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start0&nbsp;+&nbsp;8:</entry>
+		      <entry>Y'<subscript>20</subscript></entry>
+		      <entry>Y'<subscript>21</subscript></entry>
+		      <entry>Y'<subscript>22</subscript></entry>
+		      <entry>Y'<subscript>23</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start0&nbsp;+&nbsp;12:</entry>
+		      <entry>Y'<subscript>30</subscript></entry>
+		      <entry>Y'<subscript>31</subscript></entry>
+		      <entry>Y'<subscript>32</subscript></entry>
+		      <entry>Y'<subscript>33</subscript></entry>
+		    </row>
+		    <row><entry></entry></row>
+		    <row>
+		      <entry>start1&nbsp;+&nbsp;0:</entry>
+		      <entry>Cb<subscript>00</subscript></entry>
+		      <entry>Cb<subscript>01</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start1&nbsp;+&nbsp;2:</entry>
+		      <entry>Cb<subscript>10</subscript></entry>
+		      <entry>Cb<subscript>11</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start1&nbsp;+&nbsp;4:</entry>
+		      <entry>Cb<subscript>20</subscript></entry>
+		      <entry>Cb<subscript>21</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start1&nbsp;+&nbsp;6:</entry>
+		      <entry>Cb<subscript>30</subscript></entry>
+		      <entry>Cb<subscript>31</subscript></entry>
+		    </row>
+		    <row><entry></entry></row>
+		    <row>
+		      <entry>start2&nbsp;+&nbsp;0:</entry>
+		      <entry>Cr<subscript>00</subscript></entry>
+		      <entry>Cr<subscript>01</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start2&nbsp;+&nbsp;2:</entry>
+		      <entry>Cr<subscript>10</subscript></entry>
+		      <entry>Cr<subscript>11</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start2&nbsp;+&nbsp;4:</entry>
+		      <entry>Cr<subscript>20</subscript></entry>
+		      <entry>Cr<subscript>21</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start2&nbsp;+&nbsp;6:</entry>
+		      <entry>Cr<subscript>30</subscript></entry>
+		      <entry>Cr<subscript>31</subscript></entry>
+		    </row>
+		  </tbody>
+		</tgroup>
+		</informaltable>
+	      </para>
+	  </formalpara>
+
+	  <formalpara>
+	    <title>Color Sample Location.</title>
+	    <para>
+		<informaltable frame="none">
+		<tgroup cols="7" align="center">
+		  <tbody valign="top">
+		    <row>
+		      <entry></entry>
+		      <entry>0</entry><entry></entry><entry>1</entry><entry></entry>
+		      <entry>2</entry><entry></entry><entry>3</entry>
+		    </row>
+		    <row>
+		      <entry>0</entry>
+		      <entry>Y</entry><entry>C</entry><entry>Y</entry><entry></entry>
+		      <entry>Y</entry><entry>C</entry><entry>Y</entry>
+		    </row>
+		    <row>
+		      <entry>1</entry>
+		      <entry>Y</entry><entry>C</entry><entry>Y</entry><entry></entry>
+		      <entry>Y</entry><entry>C</entry><entry>Y</entry>
+		    </row>
+		    <row>
+		      <entry>2</entry>
+		      <entry>Y</entry><entry>C</entry><entry>Y</entry><entry></entry>
+		      <entry>Y</entry><entry>C</entry><entry>Y</entry>
+		    </row>
+		    <row>
+		      <entry>3</entry>
+		      <entry>Y</entry><entry>C</entry><entry>Y</entry><entry></entry>
+		      <entry>Y</entry><entry>C</entry><entry>Y</entry>
+		    </row>
+		  </tbody>
+		</tgroup>
+		</informaltable>
+	      </para>
+	  </formalpara>
+	</example>
+      </refsect1>
+    </refentry>
diff --git a/Documentation/DocBook/media/v4l/pixfmt-yuv444m.xml b/Documentation/DocBook/media/v4l/pixfmt-yuv444m.xml
new file mode 100644
index 000000000000..1b7335940bc7
--- /dev/null
+++ b/Documentation/DocBook/media/v4l/pixfmt-yuv444m.xml
@@ -0,0 +1,177 @@
+    <refentry>
+      <refmeta>
+	<refentrytitle>V4L2_PIX_FMT_YUV444M ('YM24'), V4L2_PIX_FMT_YVU444M ('YM42')</refentrytitle>
+	&manvol;
+      </refmeta>
+      <refnamediv>
+	<refname id="V4L2-PIX-FMT-YUV444M"><constant>V4L2_PIX_FMT_YUV444M</constant></refname>
+	<refname id="V4L2-PIX-FMT-YVU444M"><constant>V4L2_PIX_FMT_YVU444M</constant></refname>
+	<refpurpose>Planar formats with full horizontal resolution, also
+	known as YUV and YVU 4:4:4</refpurpose>
+      </refnamediv>
+
+      <refsect1>
+	<title>Description</title>
+
+	<para>This is a multi-planar format, as opposed to a packed format.
+The three components are separated into three sub-images or planes.</para>
+
+	<para>The Y plane is first. The Y plane has one byte per pixel.
+For <constant>V4L2_PIX_FMT_YUV444M</constant> the Cb data
+constitutes the second plane which is the same width and height as the Y plane
+(and as the image). The Cr data, just like the Cb plane, is in the third plane.
+</para>
+
+	<para><constant>V4L2_PIX_FMT_YVU444M</constant> is the same except
+the Cr data is stored in the second plane and the Cb data in the third plane.
+</para>
+	<para>If the Y plane has pad bytes after each row, then the Cb
+and Cr planes have the same number of pad bytes after their rows.</para>
+
+	<para><constant>V4L2_PIX_FMT_YUV444M</constant> and
+<constant>V4L2_PIX_FMT_YUV444M</constant> are intended to be
+used only in drivers and applications that support the multi-planar API,
+described in <xref linkend="planar-apis"/>. </para>
+
+	<example>
+	  <title><constant>V4L2_PIX_FMT_YUV444M</constant> 4 &times; 4
+pixel image</title>
+
+	  <formalpara>
+	    <title>Byte Order.</title>
+	    <para>Each cell is one byte.
+		<informaltable frame="none">
+		<tgroup cols="5" align="center">
+		  <colspec align="left" colwidth="2*" />
+		  <tbody valign="top">
+		    <row>
+		      <entry>start0&nbsp;+&nbsp;0:</entry>
+		      <entry>Y'<subscript>00</subscript></entry>
+		      <entry>Y'<subscript>01</subscript></entry>
+		      <entry>Y'<subscript>02</subscript></entry>
+		      <entry>Y'<subscript>03</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start0&nbsp;+&nbsp;4:</entry>
+		      <entry>Y'<subscript>10</subscript></entry>
+		      <entry>Y'<subscript>11</subscript></entry>
+		      <entry>Y'<subscript>12</subscript></entry>
+		      <entry>Y'<subscript>13</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start0&nbsp;+&nbsp;8:</entry>
+		      <entry>Y'<subscript>20</subscript></entry>
+		      <entry>Y'<subscript>21</subscript></entry>
+		      <entry>Y'<subscript>22</subscript></entry>
+		      <entry>Y'<subscript>23</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start0&nbsp;+&nbsp;12:</entry>
+		      <entry>Y'<subscript>30</subscript></entry>
+		      <entry>Y'<subscript>31</subscript></entry>
+		      <entry>Y'<subscript>32</subscript></entry>
+		      <entry>Y'<subscript>33</subscript></entry>
+		    </row>
+		    <row><entry></entry></row>
+		    <row>
+		      <entry>start1&nbsp;+&nbsp;0:</entry>
+		      <entry>Cb<subscript>00</subscript></entry>
+		      <entry>Cb<subscript>01</subscript></entry>
+		      <entry>Cb<subscript>02</subscript></entry>
+		      <entry>Cb<subscript>03</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start1&nbsp;+&nbsp;4:</entry>
+		      <entry>Cb<subscript>10</subscript></entry>
+		      <entry>Cb<subscript>11</subscript></entry>
+		      <entry>Cb<subscript>12</subscript></entry>
+		      <entry>Cb<subscript>13</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start1&nbsp;+&nbsp;8:</entry>
+		      <entry>Cb<subscript>20</subscript></entry>
+		      <entry>Cb<subscript>21</subscript></entry>
+		      <entry>Cb<subscript>22</subscript></entry>
+		      <entry>Cb<subscript>23</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start1&nbsp;+&nbsp;12:</entry>
+		      <entry>Cb<subscript>20</subscript></entry>
+		      <entry>Cb<subscript>21</subscript></entry>
+		      <entry>Cb<subscript>32</subscript></entry>
+		      <entry>Cb<subscript>33</subscript></entry>
+		    </row>
+		    <row><entry></entry></row>
+		    <row>
+		      <entry>start2&nbsp;+&nbsp;0:</entry>
+		      <entry>Cr<subscript>00</subscript></entry>
+		      <entry>Cr<subscript>01</subscript></entry>
+		      <entry>Cr<subscript>02</subscript></entry>
+		      <entry>Cr<subscript>03</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start2&nbsp;+&nbsp;4:</entry>
+		      <entry>Cr<subscript>10</subscript></entry>
+		      <entry>Cr<subscript>11</subscript></entry>
+		      <entry>Cr<subscript>12</subscript></entry>
+		      <entry>Cr<subscript>13</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start2&nbsp;+&nbsp;8:</entry>
+		      <entry>Cr<subscript>20</subscript></entry>
+		      <entry>Cr<subscript>21</subscript></entry>
+		      <entry>Cr<subscript>22</subscript></entry>
+		      <entry>Cr<subscript>23</subscript></entry>
+		    </row>
+		    <row>
+		      <entry>start2&nbsp;+&nbsp;12:</entry>
+		      <entry>Cr<subscript>30</subscript></entry>
+		      <entry>Cr<subscript>31</subscript></entry>
+		      <entry>Cr<subscript>32</subscript></entry>
+		      <entry>Cr<subscript>33</subscript></entry>
+		    </row>
+		  </tbody>
+		</tgroup>
+		</informaltable>
+	      </para>
+	  </formalpara>
+
+	  <formalpara>
+	    <title>Color Sample Location.</title>
+	    <para>
+		<informaltable frame="none">
+		<tgroup cols="7" align="center">
+		  <tbody valign="top">
+		    <row>
+		      <entry></entry>
+		      <entry>0</entry><entry></entry><entry>1</entry><entry></entry>
+		      <entry>2</entry><entry></entry><entry>3</entry>
+		    </row>
+		    <row>
+		      <entry>0</entry>
+		      <entry>YC</entry><entry></entry><entry>YC</entry><entry></entry>
+		      <entry>YC</entry><entry></entry><entry>YC</entry>
+		    </row>
+		    <row>
+		      <entry>1</entry>
+		      <entry>YC</entry><entry></entry><entry>YC</entry><entry></entry>
+		      <entry>YC</entry><entry></entry><entry>YC</entry>
+		    </row>
+		    <row>
+		      <entry>2</entry>
+		      <entry>YC</entry><entry></entry><entry>YC</entry><entry></entry>
+		      <entry>YC</entry><entry></entry><entry>YC</entry>
+		    </row>
+		    <row>
+		      <entry>3</entry>
+		      <entry>YC</entry><entry></entry><entry>YC</entry><entry></entry>
+		      <entry>YC</entry><entry></entry><entry>YC</entry>
+		    </row>
+		  </tbody>
+		</tgroup>
+		</informaltable>
+	      </para>
+	  </formalpara>
+	</example>
+      </refsect1>
+    </refentry>
diff --git a/Documentation/DocBook/media/v4l/pixfmt.xml b/Documentation/DocBook/media/v4l/pixfmt.xml
index 9e77ff353feb..2f02f9441443 100644
--- a/Documentation/DocBook/media/v4l/pixfmt.xml
+++ b/Documentation/DocBook/media/v4l/pixfmt.xml
@@ -1628,6 +1628,8 @@ information.</para>
     &sub-y41p;
     &sub-yuv420;
     &sub-yuv420m;
+    &sub-yuv422m;
+    &sub-yuv444m;
     &sub-yuv410;
     &sub-yuv422p;
     &sub-yuv411p;
diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c
index 8a018c6dd16a..14843090fd61 100644
--- a/drivers/media/v4l2-core/v4l2-ioctl.c
+++ b/drivers/media/v4l2-core/v4l2-ioctl.c
@@ -1191,6 +1191,10 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt)
 	case V4L2_PIX_FMT_NV12MT_16X16:	descr = "Y/CbCr 4:2:0 (16x16 MB, N-C)"; break;
 	case V4L2_PIX_FMT_YUV420M:	descr = "Planar YUV 4:2:0 (N-C)"; break;
 	case V4L2_PIX_FMT_YVU420M:	descr = "Planar YVU 4:2:0 (N-C)"; break;
+	case V4L2_PIX_FMT_YUV422M:	descr = "Planar YUV 4:2:2 (N-C)"; break;
+	case V4L2_PIX_FMT_YVU422M:	descr = "Planar YVU 4:2:2 (N-C)"; break;
+	case V4L2_PIX_FMT_YUV444M:	descr = "Planar YUV 4:4:4 (N-C)"; break;
+	case V4L2_PIX_FMT_YVU444M:	descr = "Planar YVU 4:4:4 (N-C)"; break;
 	case V4L2_PIX_FMT_SBGGR8:	descr = "8-bit Bayer BGBG/GRGR"; break;
 	case V4L2_PIX_FMT_SGBRG8:	descr = "8-bit Bayer GBGB/RGRG"; break;
 	case V4L2_PIX_FMT_SGRBG8:	descr = "8-bit Bayer GRGR/BGBG"; break;
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 14cd5ebfee6d..466458422385 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -546,6 +546,10 @@ struct v4l2_pix_format {
 /* three non contiguous planes - Y, Cb, Cr */
 #define V4L2_PIX_FMT_YUV420M v4l2_fourcc('Y', 'M', '1', '2') /* 12  YUV420 planar */
 #define V4L2_PIX_FMT_YVU420M v4l2_fourcc('Y', 'M', '2', '1') /* 12  YVU420 planar */
+#define V4L2_PIX_FMT_YUV422M v4l2_fourcc('Y', 'M', '1', '6') /* 16  YUV422 planar */
+#define V4L2_PIX_FMT_YVU422M v4l2_fourcc('Y', 'M', '6', '1') /* 16  YVU422 planar */
+#define V4L2_PIX_FMT_YUV444M v4l2_fourcc('Y', 'M', '2', '4') /* 24  YUV444 planar */
+#define V4L2_PIX_FMT_YVU444M v4l2_fourcc('Y', 'M', '4', '2') /* 24  YVU444 planar */
 
 /* Bayer formats - see http://www.siliconimaging.com/RGB%20Bayer.htm */
 #define V4L2_PIX_FMT_SBGGR8  v4l2_fourcc('B', 'A', '8', '1') /*  8  BGBG.. GRGR.. */
-- 
cgit v1.2.3


From 2125715635053d4207a756a35aa718f548824e58 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Tue, 16 Feb 2016 12:46:54 +0100
Subject: bridge: mdb: add support for more attributes and export timer

Currently mdb entries are exported directly as a structure inside
MDBA_MDB_ENTRY_INFO attribute, we can't really extend it without
breaking user-space. In order to export new mdb fields, I've converted
the MDBA_MDB_ENTRY_INFO into a nested attribute which starts like before
with struct br_mdb_entry (without header, as it's casted directly in
iproute2) and continues with MDBA_MDB_EATTR_ attributes. This way we
keep compatibility with older users and can export new data.
I've tested this with iproute2, both with and without support for the
added attribute and it works fine.
So basically we again have MDBA_MDB_ENTRY_INFO with struct br_mdb_entry
inside but it may contain also some additional MDBA_MDB_EATTR_ attributes
such as MDBA_MDB_EATTR_TIMER which can be parsed by user-space.

So the new structure is:
[MDBA_MDB] = {
     [MDBA_MDB_ENTRY] = {
         [MDBA_MDB_ENTRY_INFO]
         [MDBA_MDB_ENTRY_INFO] { <- Nested attribute
             struct br_mdb_entry <- nla_put_nohdr()
             [MDBA_MDB_ENTRY attributes] <- normal netlink attributes
         }
     }
}

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 13 ++++++++++++-
 net/bridge/br_mdb.c            | 16 +++++++++++++++-
 2 files changed, 27 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index ec3547234998..0890b217580d 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -137,7 +137,10 @@ struct bridge_vlan_info {
 /* Bridge multicast database attributes
  * [MDBA_MDB] = {
  *     [MDBA_MDB_ENTRY] = {
- *         [MDBA_MDB_ENTRY_INFO]
+ *         [MDBA_MDB_ENTRY_INFO] {
+ *		struct br_mdb_entry
+ *		[MDBA_MDB_EATTR attributes]
+ *         }
  *     }
  * }
  * [MDBA_ROUTER] = {
@@ -166,6 +169,14 @@ enum {
 };
 #define MDBA_MDB_ENTRY_MAX (__MDBA_MDB_ENTRY_MAX - 1)
 
+/* per mdb entry additional attributes */
+enum {
+	MDBA_MDB_EATTR_UNSPEC,
+	MDBA_MDB_EATTR_TIMER,
+	__MDBA_MDB_EATTR_MAX
+};
+#define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1)
+
 enum {
 	MDBA_ROUTER_UNSPEC,
 	MDBA_ROUTER_PORT,
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index e66619171386..cf51b7bcb5d5 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -88,11 +88,13 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 			for (pp = &mp->ports;
 			     (p = rcu_dereference(*pp)) != NULL;
 			      pp = &p->next) {
+				struct nlattr *nest_ent;
 				struct br_mdb_entry e;
 
 				port = p->port;
 				if (!port)
 					continue;
+
 				memset(&e, 0, sizeof(e));
 				e.ifindex = port->dev->ifindex;
 				e.vid = p->addr.vid;
@@ -104,11 +106,23 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 					e.addr.u.ip6 = p->addr.u.ip6;
 #endif
 				e.addr.proto = p->addr.proto;
-				if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) {
+				nest_ent = nla_nest_start(skb,
+							  MDBA_MDB_ENTRY_INFO);
+				if (!nest_ent) {
+					nla_nest_cancel(skb, nest2);
+					err = -EMSGSIZE;
+					goto out;
+				}
+				if (nla_put_nohdr(skb, sizeof(e), &e) ||
+				    nla_put_u32(skb,
+						MDBA_MDB_EATTR_TIMER,
+						br_timer_value(&p->timer))) {
+					nla_nest_cancel(skb, nest_ent);
 					nla_nest_cancel(skb, nest2);
 					err = -EMSGSIZE;
 					goto out;
 				}
+				nla_nest_end(skb, nest_ent);
 			}
 			nla_nest_end(skb, nest2);
 		skip:
-- 
cgit v1.2.3


From ac2c7ad0e5d6030452c9af2fafd192e17fd04264 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@intel.com>
Date: Fri, 19 Feb 2016 09:24:01 -0500
Subject: net/ethtool: introduce a new ioctl for per queue setting

Introduce a new ioctl ETHTOOL_PERQUEUE for per queue parameters setting.
The following patches will enable some SUB_COMMANDs for per queue
setting.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Reviewed-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 17 +++++++++++++++++
 net/core/ethtool.c           | 27 +++++++++++++++++++++++++--
 2 files changed, 42 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 190aea0faaf4..f15ae02621a1 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1202,6 +1202,21 @@ enum ethtool_sfeatures_retval_bits {
 #define ETHTOOL_F_WISH          (1 << ETHTOOL_F_WISH__BIT)
 #define ETHTOOL_F_COMPAT        (1 << ETHTOOL_F_COMPAT__BIT)
 
+#define MAX_NUM_QUEUE		4096
+
+/**
+ * struct ethtool_per_queue_op - apply sub command to the queues in mask.
+ * @cmd: ETHTOOL_PERQUEUE
+ * @sub_command: the sub command which apply to each queues
+ * @queue_mask: Bitmap of the queues which sub command apply to
+ * @data: A complete command structure following for each of the queues addressed
+ */
+struct ethtool_per_queue_op {
+	__u32	cmd;
+	__u32	sub_command;
+	__u32	queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
+	char	data[];
+};
 
 /* CMDs currently supported */
 #define ETHTOOL_GSET		0x00000001 /* Get settings. */
@@ -1285,6 +1300,8 @@ enum ethtool_sfeatures_retval_bits {
 #define ETHTOOL_STUNABLE	0x00000049 /* Set tunable configuration */
 #define ETHTOOL_GPHYSTATS	0x0000004a /* get PHY-specific statistics */
 
+#define ETHTOOL_PERQUEUE	0x0000004b /* Set per queue options */
+
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
 #define SPARC_ETH_SSET		ETHTOOL_SSET
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index c2d3118b1395..d640ecf71e74 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1888,13 +1888,27 @@ out:
 	return ret;
 }
 
+static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
+{
+	struct ethtool_per_queue_op per_queue_opt;
+
+	if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt)))
+		return -EFAULT;
+
+	switch (per_queue_opt.sub_command) {
+
+	default:
+		return -EOPNOTSUPP;
+	};
+}
+
 /* The main entry point in this file.  Called from net/core/dev_ioctl.c */
 
 int dev_ethtool(struct net *net, struct ifreq *ifr)
 {
 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
 	void __user *useraddr = ifr->ifr_data;
-	u32 ethcmd;
+	u32 ethcmd, sub_cmd;
 	int rc;
 	netdev_features_t old_features;
 
@@ -1904,8 +1918,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
 		return -EFAULT;
 
+	if (ethcmd == ETHTOOL_PERQUEUE) {
+		if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
+			return -EFAULT;
+	} else {
+		sub_cmd = ethcmd;
+	}
 	/* Allow some commands to be done by anyone */
-	switch (ethcmd) {
+	switch (sub_cmd) {
 	case ETHTOOL_GSET:
 	case ETHTOOL_GDRVINFO:
 	case ETHTOOL_GMSGLVL:
@@ -2135,6 +2155,9 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GPHYSTATS:
 		rc = ethtool_get_phy_stats(dev, useraddr);
 		break;
+	case ETHTOOL_PERQUEUE:
+		rc = ethtool_set_per_queue(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From d5a3b1f691865be576c2bffa708549b8cdccda19 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Wed, 17 Feb 2016 19:58:58 -0800
Subject: bpf: introduce BPF_MAP_TYPE_STACK_TRACE

add new map type to store stack traces and corresponding helper
bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id
@ctx: struct pt_regs*
@map: pointer to stack_trace map
@flags: bits 0-7 - numer of stack frames to skip
        bit 8 - collect user stack instead of kernel
        bit 9 - compare stacks by hash only
        bit 10 - if two different stacks hash into the same stackid
                 discard old
        other bits - reserved
Return: >= 0 stackid on success or negative error

stackid is a 32-bit integer handle that can be further combined with
other data (including other stackid) and used as a key into maps.

Userspace will access stackmap using standard lookup/delete syscall commands to
retrieve full stack trace for given stackid.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   1 +
 include/uapi/linux/bpf.h |  21 +++++
 kernel/bpf/Makefile      |   3 +
 kernel/bpf/stackmap.c    | 237 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/verifier.c    |   6 +-
 kernel/trace/bpf_trace.c |   2 +
 6 files changed, 269 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/stackmap.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 90ee6ab24bc5..0cadbb7456c0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -237,6 +237,7 @@ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto;
 extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
+extern const struct bpf_func_proto bpf_get_stackid_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2ee0fde1bf96..d3e77da8e9e8 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -83,6 +83,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_PERF_EVENT_ARRAY,
 	BPF_MAP_TYPE_PERCPU_HASH,
 	BPF_MAP_TYPE_PERCPU_ARRAY,
+	BPF_MAP_TYPE_STACK_TRACE,
 };
 
 enum bpf_prog_type {
@@ -272,6 +273,20 @@ enum bpf_func_id {
 	 */
 	BPF_FUNC_perf_event_output,
 	BPF_FUNC_skb_load_bytes,
+
+	/**
+	 * bpf_get_stackid(ctx, map, flags) - walk user or kernel stack and return id
+	 * @ctx: struct pt_regs*
+	 * @map: pointer to stack_trace map
+	 * @flags: bits 0-7 - numer of stack frames to skip
+	 *         bit 8 - collect user stack instead of kernel
+	 *         bit 9 - compare stacks by hash only
+	 *         bit 10 - if two different stacks hash into the same stackid
+	 *                  discard old
+	 *         other bits - reserved
+	 * Return: >= 0 stackid on success or negative error
+	 */
+	BPF_FUNC_get_stackid,
 	__BPF_FUNC_MAX_ID,
 };
 
@@ -294,6 +309,12 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
+/* BPF_FUNC_get_stackid flags. */
+#define BPF_F_SKIP_FIELD_MASK		0xffULL
+#define BPF_F_USER_STACK		(1ULL << 8)
+#define BPF_F_FAST_STACK_CMP		(1ULL << 9)
+#define BPF_F_REUSE_STACKID		(1ULL << 10)
+
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
  */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 13272582eee0..8a932d079c24 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,3 +2,6 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
+ifeq ($(CONFIG_PERF_EVENTS),y)
+obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
+endif
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
new file mode 100644
index 000000000000..8a60ee14a977
--- /dev/null
+++ b/kernel/bpf/stackmap.c
@@ -0,0 +1,237 @@
+/* Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+#include <linux/stacktrace.h>
+#include <linux/perf_event.h>
+
+struct stack_map_bucket {
+	struct rcu_head rcu;
+	u32 hash;
+	u32 nr;
+	u64 ip[];
+};
+
+struct bpf_stack_map {
+	struct bpf_map map;
+	u32 n_buckets;
+	struct stack_map_bucket __rcu *buckets[];
+};
+
+/* Called from syscall */
+static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
+{
+	u32 value_size = attr->value_size;
+	struct bpf_stack_map *smap;
+	u64 cost, n_buckets;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    value_size < 8 || value_size % 8 ||
+	    value_size / 8 > PERF_MAX_STACK_DEPTH)
+		return ERR_PTR(-EINVAL);
+
+	/* hash table size must be power of 2 */
+	n_buckets = roundup_pow_of_two(attr->max_entries);
+
+	cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap);
+	if (cost >= U32_MAX - PAGE_SIZE)
+		return ERR_PTR(-E2BIG);
+
+	smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
+	if (!smap) {
+		smap = vzalloc(cost);
+		if (!smap)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	err = -E2BIG;
+	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_smap;
+
+	smap->map.map_type = attr->map_type;
+	smap->map.key_size = attr->key_size;
+	smap->map.value_size = value_size;
+	smap->map.max_entries = attr->max_entries;
+	smap->n_buckets = n_buckets;
+	smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	err = get_callchain_buffers();
+	if (err)
+		goto free_smap;
+
+	return &smap->map;
+
+free_smap:
+	kvfree(smap);
+	return ERR_PTR(err);
+}
+
+static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
+{
+	struct pt_regs *regs = (struct pt_regs *) (long) r1;
+	struct bpf_map *map = (struct bpf_map *) (long) r2;
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	struct perf_callchain_entry *trace;
+	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
+	u32 max_depth = map->value_size / 8;
+	/* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */
+	u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	u32 hash, id, trace_nr, trace_len;
+	bool user = flags & BPF_F_USER_STACK;
+	bool kernel = !user;
+	u64 *ips;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+		return -EINVAL;
+
+	trace = get_perf_callchain(regs, init_nr, kernel, user, false, false);
+
+	if (unlikely(!trace))
+		/* couldn't fetch the stack trace */
+		return -EFAULT;
+
+	/* get_perf_callchain() guarantees that trace->nr >= init_nr
+	 * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth
+	 */
+	trace_nr = trace->nr - init_nr;
+
+	if (trace_nr <= skip)
+		/* skipping more than usable stack trace */
+		return -EFAULT;
+
+	trace_nr -= skip;
+	trace_len = trace_nr * sizeof(u64);
+	ips = trace->ip + skip + init_nr;
+	hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
+	id = hash & (smap->n_buckets - 1);
+	bucket = rcu_dereference(smap->buckets[id]);
+
+	if (bucket && bucket->hash == hash) {
+		if (flags & BPF_F_FAST_STACK_CMP)
+			return id;
+		if (bucket->nr == trace_nr &&
+		    memcmp(bucket->ip, ips, trace_len) == 0)
+			return id;
+	}
+
+	/* this call stack is not in the map, try to add it */
+	if (bucket && !(flags & BPF_F_REUSE_STACKID))
+		return -EEXIST;
+
+	new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size,
+			     GFP_ATOMIC | __GFP_NOWARN);
+	if (unlikely(!new_bucket))
+		return -ENOMEM;
+
+	memcpy(new_bucket->ip, ips, trace_len);
+	memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len);
+	new_bucket->hash = hash;
+	new_bucket->nr = trace_nr;
+
+	old_bucket = xchg(&smap->buckets[id], new_bucket);
+	if (old_bucket)
+		kfree_rcu(old_bucket, rcu);
+	return id;
+}
+
+const struct bpf_func_proto bpf_get_stackid_proto = {
+	.func		= bpf_get_stackid,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+};
+
+/* Called from syscall or from eBPF program */
+static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	struct stack_map_bucket *bucket;
+	u32 id = *(u32 *)key;
+
+	if (unlikely(id >= smap->n_buckets))
+		return NULL;
+	bucket = rcu_dereference(smap->buckets[id]);
+	return bucket ? bucket->ip : NULL;
+}
+
+static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	return -EINVAL;
+}
+
+static int stack_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 map_flags)
+{
+	return -EINVAL;
+}
+
+/* Called from syscall or from eBPF program */
+static int stack_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	struct stack_map_bucket *old_bucket;
+	u32 id = *(u32 *)key;
+
+	if (unlikely(id >= smap->n_buckets))
+		return -E2BIG;
+
+	old_bucket = xchg(&smap->buckets[id], NULL);
+	if (old_bucket) {
+		kfree_rcu(old_bucket, rcu);
+		return 0;
+	} else {
+		return -ENOENT;
+	}
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void stack_map_free(struct bpf_map *map)
+{
+	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
+	int i;
+
+	synchronize_rcu();
+
+	for (i = 0; i < smap->n_buckets; i++)
+		if (smap->buckets[i])
+			kfree_rcu(smap->buckets[i], rcu);
+	kvfree(smap);
+	put_callchain_buffers();
+}
+
+static const struct bpf_map_ops stack_map_ops = {
+	.map_alloc = stack_map_alloc,
+	.map_free = stack_map_free,
+	.map_get_next_key = stack_map_get_next_key,
+	.map_lookup_elem = stack_map_lookup_elem,
+	.map_update_elem = stack_map_update_elem,
+	.map_delete_elem = stack_map_delete_elem,
+};
+
+static struct bpf_map_type_list stack_map_type __read_mostly = {
+	.ops = &stack_map_ops,
+	.type = BPF_MAP_TYPE_STACK_TRACE,
+};
+
+static int __init register_stack_map(void)
+{
+	bpf_register_map_type(&stack_map_type);
+	return 0;
+}
+late_initcall(register_stack_map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d1d3e8f57de9..42ba4ccc020b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -246,6 +246,7 @@ static const struct {
 	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
 	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
 	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
+	{BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid},
 };
 
 static void print_verifier_state(struct verifier_env *env)
@@ -911,8 +912,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		 * don't allow any other map type to be passed into
 		 * the special func;
 		 */
-		if (bool_func && bool_map != bool_func)
+		if (bool_func && bool_map != bool_func) {
+			verbose("cannot pass map_type %d into func %d\n",
+				map->map_type, func_id);
 			return -EINVAL;
+		}
 	}
 
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 326a75e884db..4b8caa392b86 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -299,6 +299,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_perf_event_read_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_perf_event_output_proto;
+	case BPF_FUNC_get_stackid:
+		return &bpf_get_stackid_proto;
 	default:
 		return NULL;
 	}
-- 
cgit v1.2.3


From 7d672345ed295b1356a5d9f7111da1d1d7d65867 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:23 +0100
Subject: bpf: add generic bpf_csum_diff helper

For L4 checksums, we currently have bpf_l4_csum_replace() helper. It's
currently limited to handle 2 and 4 byte changes in a header and feeds the
from/to into inet_proto_csum_replace{2,4}() helpers of the kernel. When
working with IPv6, for example, this makes it rather cumbersome to deal
with, similarly when editing larger parts of a header.

Instead, extend the API in a more generic way: For bpf_l4_csum_replace(),
add a case for header field mask of 0 to change the checksum at a given
offset through inet_proto_csum_replace_by_diff(), and provide a helper
bpf_csum_diff() that can generically calculate a from/to diff for arbitrary
amounts of data.

This can be used in multiple ways: for the bpf_l4_csum_replace() only
part, this even provides us with the option to insert precalculated diffs
from user space f.e. from a map, or from bpf_csum_diff() during runtime.

bpf_csum_diff() has a optional from/to stack buffer input, so we can
calculate a diff by using a scratchbuffer for scenarios where we're
inserting (from is NULL), removing (to is NULL) or diffing (from/to buffers
don't need to be of equal size) data. Also, bpf_csum_diff() allows to
feed a previous csum into csum_partial(), so the function can also be
cascaded.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 11 ++++++++++
 net/core/filter.c        | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d3e77da8e9e8..48d0a6c54609 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -287,6 +287,17 @@ enum bpf_func_id {
 	 * Return: >= 0 stackid on success or negative error
 	 */
 	BPF_FUNC_get_stackid,
+
+	/**
+	 * bpf_csum_diff(from, from_size, to, to_size, seed) - calculate csum diff
+	 * @from: raw from buffer
+	 * @from_size: length of from buffer
+	 * @to: raw to buffer
+	 * @to_size: length of to buffer
+	 * @seed: optional seed
+	 * Return: csum result
+	 */
+	BPF_FUNC_csum_diff,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 2a6e9562f1ab..bf504f8fbe15 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1491,6 +1491,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EFAULT;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
+	case 0:
+		if (unlikely(from != 0))
+			return -EINVAL;
+
+		inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+		break;
 	case 2:
 		inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
 		break;
@@ -1519,6 +1525,51 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = {
 	.arg5_type	= ARG_ANYTHING,
 };
 
+struct bpf_csum_scratchpad {
+	__be32 diff[128];
+};
+
+static DEFINE_PER_CPU(struct bpf_csum_scratchpad, bpf_csum_sp);
+
+static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
+{
+	struct bpf_csum_scratchpad *sp = this_cpu_ptr(&bpf_csum_sp);
+	u64 diff_size = from_size + to_size;
+	__be32 *from = (__be32 *) (long) r1;
+	__be32 *to   = (__be32 *) (long) r3;
+	int i, j = 0;
+
+	/* This is quite flexible, some examples:
+	 *
+	 * from_size == 0, to_size > 0,  seed := csum --> pushing data
+	 * from_size > 0,  to_size == 0, seed := csum --> pulling data
+	 * from_size > 0,  to_size > 0,  seed := 0    --> diffing data
+	 *
+	 * Even for diffing, from_size and to_size don't need to be equal.
+	 */
+	if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
+		     diff_size > sizeof(sp->diff)))
+		return -EINVAL;
+
+	for (i = 0; i < from_size / sizeof(__be32); i++, j++)
+		sp->diff[j] = ~from[i];
+	for (i = 0; i <   to_size / sizeof(__be32); i++, j++)
+		sp->diff[j] = to[i];
+
+	return csum_partial(sp->diff, diff_size, seed);
+}
+
+const struct bpf_func_proto bpf_csum_diff_proto = {
+	.func		= bpf_csum_diff,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_STACK,
+	.arg2_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg3_type	= ARG_PTR_TO_STACK,
+	.arg4_type	= ARG_CONST_STACK_SIZE_OR_ZERO,
+	.arg5_type	= ARG_ANYTHING,
+};
+
 static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
@@ -1849,6 +1900,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 		return &bpf_skb_store_bytes_proto;
 	case BPF_FUNC_skb_load_bytes:
 		return &bpf_skb_load_bytes_proto;
+	case BPF_FUNC_csum_diff:
+		return &bpf_csum_diff_proto;
 	case BPF_FUNC_l3_csum_replace:
 		return &bpf_l3_csum_replace_proto;
 	case BPF_FUNC_l4_csum_replace:
-- 
cgit v1.2.3


From 2f72959a9c1260ade234f353ccca91118151af66 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 19 Feb 2016 23:05:26 +0100
Subject: bpf: fix csum update in bpf_l4_csum_replace helper for udp

When using this helper for updating UDP checksums, we need to extend
this in order to write CSUM_MANGLED_0 for csum computations that result
into 0 as sum. Reason we need this is because packets with a checksum
could otherwise become incorrectly marked as a packet without a checksum.
Likewise, if the user indicates BPF_F_MARK_MANGLED_0, then we should
not turn packets without a checksum into ones with a checksum.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 48d0a6c54609..6496f98d3d68 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -313,6 +313,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_l4_csum_replace flags. */
 #define BPF_F_PSEUDO_HDR		(1ULL << 4)
+#define BPF_F_MARK_MANGLED_0		(1ULL << 5)
 
 /* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */
 #define BPF_F_INGRESS			(1ULL << 0)
diff --git a/net/core/filter.c b/net/core/filter.c
index f031b82128f3..8a0b8c3eb189 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1477,10 +1477,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 {
 	struct sk_buff *skb = (struct sk_buff *) (long) r1;
 	bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
+	bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
 	int offset = (int) r2;
 	__sum16 sum, *ptr;
 
-	if (unlikely(flags & ~(BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
+	if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
+			       BPF_F_HDR_FIELD_MASK)))
 		return -EINVAL;
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
@@ -1490,6 +1492,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
 	if (unlikely(!ptr))
 		return -EFAULT;
+	if (is_mmzero && !*ptr)
+		return 0;
 
 	switch (flags & BPF_F_HDR_FIELD_MASK) {
 	case 0:
@@ -1508,6 +1512,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 		return -EINVAL;
 	}
 
+	if (is_mmzero && !*ptr)
+		*ptr = CSUM_MANGLED_0;
 	if (ptr == &sum)
 		/* skb_store_bits guaranteed to not return -EFAULT here */
 		skb_store_bits(skb, offset, ptr, sizeof(sum));
-- 
cgit v1.2.3


From c84982adb23bcf3b99b79ca33527cd2625fbe279 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 22 Feb 2016 16:02:32 -0700
Subject: vfio: Define capability chains

We have a few cases where we need to extend the data returned from the
INFO ioctls in VFIO.  For instance we already have devices exposed
through vfio-pci where VFIO_DEVICE_GET_REGION_INFO reports the region
as mmap-capable, but really only supports sparse mmaps, avoiding the
MSI-X table.  If we wanted to provide in-kernel emulation or extended
functionality for devices, we'd also want the ability to tell the
user not to mmap various regions, rather than forcing them to figure
it out on their own.

Another example is VFIO_IOMMU_GET_INFO.  We'd really like to expose
the actual IOVA capabilities of the IOMMU rather than letting the
user assume the address space they have available to them.  We could
add IOVA base and size fields to struct vfio_iommu_type1_info, but
what if we have multiple IOVA ranges.  For instance x86 uses a range
of addresses at 0xfee00000 for MSI vectors.  These typically are not
available for standard DMA IOVA mappings and splits our available IOVA
space into two ranges.  POWER systems have both an IOVA window below
4G as well as dynamic data window which they can use to remap all of
guest memory.

Representing variable sized arrays within a fixed structure makes it
very difficult to parse, we'd therefore like to put this data beyond
fixed fields within the data structures.  One way to do this is to
emulate capabilities in PCI configuration space.  A new flag indciates
whether capabilties are supported and a new fixed field reports the
offset of the first entry.  Users can then walk the chain to find
capabilities, adding capabilities does not require additional fields
in the fixed structure, and parsing variable sized data becomes
trivial.

This patch outlines the theory and base header structure, which
should be shared by all future users.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/uapi/linux/vfio.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 7d7a4c6f2090..d508adf17610 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -59,6 +59,33 @@
 #define VFIO_TYPE	(';')
 #define VFIO_BASE	100
 
+/*
+ * For extension of INFO ioctls, VFIO makes use of a capability chain
+ * designed after PCI/e capabilities.  A flag bit indicates whether
+ * this capability chain is supported and a field defined in the fixed
+ * structure defines the offset of the first capability in the chain.
+ * This field is only valid when the corresponding bit in the flags
+ * bitmap is set.  This offset field is relative to the start of the
+ * INFO buffer, as is the next field within each capability header.
+ * The id within the header is a shared address space per INFO ioctl,
+ * while the version field is specific to the capability id.  The
+ * contents following the header are specific to the capability id.
+ */
+struct vfio_info_cap_header {
+	__u16	id;		/* Identifies capability */
+	__u16	version;	/* Version specific to the capability ID */
+	__u32	next;		/* Offset of next capability */
+};
+
+/*
+ * Callers of INFO ioctls passing insufficiently sized buffers will see
+ * the capability chain flag bit set, a zero value for the first capability
+ * offset (if available within the provided argsz), and argsz will be
+ * updated to report the necessary buffer size.  For compatibility, the
+ * INFO ioctl will not report error in this case, but the capability chain
+ * will not be available.
+ */
+
 /* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */
 
 /**
-- 
cgit v1.2.3


From ff63eb638d63b95e489f976428f1df01391e15e4 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 22 Feb 2016 16:02:35 -0700
Subject: vfio: Define sparse mmap capability for regions

We can't always support mmap across an entire device region, for
example we deny mmaps covering the MSI-X table of PCI devices, but
we don't really have a way to report it.  We expect the user to
implicitly know this restriction.  We also can't split the region
because vfio-pci defines an API with fixed region index to BAR
number mapping.  We therefore define a new capability which lists
areas within the region that may be mmap'd.  In addition to the
MSI-X case, this potentially enables in-kernel emulation and
extensions to devices.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/uapi/linux/vfio.h | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index d508adf17610..fde7b1e60948 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -221,13 +221,37 @@ struct vfio_region_info {
 #define VFIO_REGION_INFO_FLAG_READ	(1 << 0) /* Region supports read */
 #define VFIO_REGION_INFO_FLAG_WRITE	(1 << 1) /* Region supports write */
 #define VFIO_REGION_INFO_FLAG_MMAP	(1 << 2) /* Region supports mmap */
+#define VFIO_REGION_INFO_FLAG_CAPS	(1 << 3) /* Info supports caps */
 	__u32	index;		/* Region index */
-	__u32	resv;		/* Reserved for alignment */
+	__u32	cap_offset;	/* Offset within info struct of first cap */
 	__u64	size;		/* Region size (bytes) */
 	__u64	offset;		/* Region offset from start of device fd */
 };
 #define VFIO_DEVICE_GET_REGION_INFO	_IO(VFIO_TYPE, VFIO_BASE + 8)
 
+/*
+ * The sparse mmap capability allows finer granularity of specifying areas
+ * within a region with mmap support.  When specified, the user should only
+ * mmap the offset ranges specified by the areas array.  mmaps outside of the
+ * areas specified may fail (such as the range covering a PCI MSI-X table) or
+ * may result in improper device behavior.
+ *
+ * The structures below define version 1 of this capability.
+ */
+#define VFIO_REGION_INFO_CAP_SPARSE_MMAP	1
+
+struct vfio_region_sparse_mmap_area {
+	__u64	offset;	/* Offset of mmap'able area within region */
+	__u64	size;	/* Size of mmap'able area */
+};
+
+struct vfio_region_info_cap_sparse_mmap {
+	struct vfio_info_cap_header header;
+	__u32	nr_areas;
+	__u32	reserved;
+	struct vfio_region_sparse_mmap_area areas[];
+};
+
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
  *				    struct vfio_irq_info)
-- 
cgit v1.2.3


From c7bb4cb40f89224dc55755178343728e30dd583a Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 22 Feb 2016 16:02:38 -0700
Subject: vfio: Define device specific region type capability

To this point vfio has only provided an interface to the user that
allows them to determine the number of regions and specifics about
each region.  What the region represents is left to the vfio bus
driver.  vfio-pci chooses to use fixed indexes for fixed resources,
index 0 is BAR0, 1 is BAR1,... 7 is config space, etc.  This works
pretty well since all PCI devices have these regions, even if they
don't necessarily populate all of them.  Then we start to add things
like VGA, which only certain device even support.  We added this the
same way, but now we've wasted a region index, and due to our offset
implementation the corresponding address space, for all devices.

Rather than continuing that process, let's try to make regions self
describing by including a capability that defines their type.  For
vfio-pci we'll make the current VFIO_PCI_NUM_REGIONS fixed, defining
the end of the static indexes and the beginning of self describing
regions.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/uapi/linux/vfio.h | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index fde7b1e60948..1c37a0e500c6 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -252,6 +252,34 @@ struct vfio_region_info_cap_sparse_mmap {
 	struct vfio_region_sparse_mmap_area areas[];
 };
 
+/*
+ * The device specific type capability allows regions unique to a specific
+ * device or class of devices to be exposed.  This helps solve the problem for
+ * vfio bus drivers of defining which region indexes correspond to which region
+ * on the device, without needing to resort to static indexes, as done by
+ * vfio-pci.  For instance, if we were to go back in time, we might remove
+ * VFIO_PCI_VGA_REGION_INDEX and let vfio-pci simply define that all indexes
+ * greater than or equal to VFIO_PCI_NUM_REGIONS are device specific and we'd
+ * make a "VGA" device specific type to describe the VGA access space.  This
+ * means that non-VGA devices wouldn't need to waste this index, and thus the
+ * address space associated with it due to implementation of device file
+ * descriptor offsets in vfio-pci.
+ *
+ * The current implementation is now part of the user ABI, so we can't use this
+ * for VGA, but there are other upcoming use cases, such as opregions for Intel
+ * IGD devices and framebuffers for vGPU devices.  We missed VGA, but we'll
+ * use this for future additions.
+ *
+ * The structure below defines version 1 of this capability.
+ */
+#define VFIO_REGION_INFO_CAP_TYPE	2
+
+struct vfio_region_info_cap_type {
+	struct vfio_info_cap_header header;
+	__u32 type;	/* global per bus driver */
+	__u32 subtype;	/* type specific */
+};
+
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
  *				    struct vfio_irq_info)
@@ -387,7 +415,8 @@ enum {
 	 * between described ranges are unimplemented.
 	 */
 	VFIO_PCI_VGA_REGION_INDEX,
-	VFIO_PCI_NUM_REGIONS
+	VFIO_PCI_NUM_REGIONS = 9 /* Fixed user ABI, region indexes >=9 use */
+				 /* device specific cap to define content. */
 };
 
 enum {
-- 
cgit v1.2.3


From 5846ff54e87d8bab4f1e330af0b5407747a0a57e Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 22 Feb 2016 16:02:43 -0700
Subject: vfio/pci: Intel IGD OpRegion support

This is the first consumer of vfio device specific resource support,
providing read-only access to the OpRegion for Intel graphics devices.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/Kconfig            |   4 ++
 drivers/vfio/pci/Makefile           |   1 +
 drivers/vfio/pci/vfio_pci.c         |   7 +++
 drivers/vfio/pci/vfio_pci_igd.c     | 111 ++++++++++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci_private.h |   8 +++
 include/uapi/linux/vfio.h           |   5 ++
 6 files changed, 136 insertions(+)
 create mode 100644 drivers/vfio/pci/vfio_pci_igd.c

(limited to 'include/uapi/linux')

diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index 02912f180c6d..24ee2605b9f0 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -26,3 +26,7 @@ config VFIO_PCI_MMAP
 config VFIO_PCI_INTX
 	depends on VFIO_PCI
 	def_bool y if !S390
+
+config VFIO_PCI_IGD
+	depends on VFIO_PCI
+	def_bool y if X86
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 131079255fd9..76d8ec058edd 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,4 +1,5 @@
 
 vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
+vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 813a2e67aa0c..cb2624db37d8 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -169,6 +169,13 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
 	if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev))
 		vdev->has_vga = true;
 
+
+	if (vfio_pci_is_vga(pdev) && pdev->vendor == PCI_VENDOR_ID_INTEL) {
+		if (vfio_pci_igd_opregion_init(vdev) == 0)
+			dev_info(&pdev->dev,
+				 "Intel IGD OpRegion support enabled\n");
+	}
+
 	return 0;
 }
 
diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c
new file mode 100644
index 000000000000..3b6a6f7b367b
--- /dev/null
+++ b/drivers/vfio/pci/vfio_pci_igd.c
@@ -0,0 +1,111 @@
+/*
+ * VFIO PCI Intel Graphics support
+ *
+ * Copyright (C) 2016 Red Hat, Inc.  All rights reserved.
+ *	Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Register a device specific region through which to provide read-only
+ * access to the Intel IGD opregion.  The register defining the opregion
+ * address is also virtualized to prevent user modification.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/uaccess.h>
+#include <linux/vfio.h>
+
+#include "vfio_pci_private.h"
+
+#define OPREGION_SIGNATURE	"IntelGraphicsMem"
+#define OPREGION_SIZE		(8 * 1024)
+#define OPREGION_PCI_ADDR	0xfc
+
+static size_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf,
+			      size_t count, loff_t *ppos, bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	void *base = vdev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+	if (pos >= vdev->region[i].size || iswrite)
+		return -EINVAL;
+
+	count = min(count, (size_t)(vdev->region[i].size - pos));
+
+	if (copy_to_user(buf, base + pos, count))
+		return -EFAULT;
+
+	*ppos += count;
+
+	return count;
+}
+
+static void vfio_pci_igd_release(struct vfio_pci_device *vdev,
+				 struct vfio_pci_region *region)
+{
+	memunmap(region->data);
+}
+
+static const struct vfio_pci_regops vfio_pci_igd_regops = {
+	.rw		= vfio_pci_igd_rw,
+	.release	= vfio_pci_igd_release,
+};
+
+int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev)
+{
+	__le32 *dwordp = (__le32 *)(vdev->vconfig + OPREGION_PCI_ADDR);
+	u32 addr, size;
+	void *base;
+	int ret;
+
+	ret = pci_read_config_dword(vdev->pdev, OPREGION_PCI_ADDR, &addr);
+	if (ret)
+		return ret;
+
+	if (!addr || !(~addr))
+		return -ENODEV;
+
+	base = memremap(addr, OPREGION_SIZE, MEMREMAP_WB);
+	if (!base)
+		return -ENOMEM;
+
+	if (memcmp(base, OPREGION_SIGNATURE, 16)) {
+		memunmap(base);
+		return -EINVAL;
+	}
+
+	size = le32_to_cpu(*(__le32 *)(base + 16));
+	if (!size) {
+		memunmap(base);
+		return -EINVAL;
+	}
+
+	size *= 1024; /* In KB */
+
+	if (size != OPREGION_SIZE) {
+		memunmap(base);
+		base = memremap(addr, size, MEMREMAP_WB);
+		if (!base)
+			return -ENOMEM;
+	}
+
+	ret = vfio_pci_register_dev_region(vdev,
+		PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+		VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
+		&vfio_pci_igd_regops, size, VFIO_REGION_INFO_FLAG_READ, base);
+	if (ret) {
+		memunmap(base);
+		return ret;
+	}
+
+	/* Fill vconfig with the hw value and virtualize register */
+	*dwordp = cpu_to_le32(addr);
+	memset(vdev->pci_config_map + OPREGION_PCI_ADDR,
+	       PCI_CAP_ID_INVALID_VIRT, 4);
+
+	return ret;
+}
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index b1e403235feb..19f7699ac699 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -122,4 +122,12 @@ extern int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
 					unsigned int type, unsigned int subtype,
 					const struct vfio_pci_regops *ops,
 					size_t size, u32 flags, void *data);
+#ifdef CONFIG_VFIO_PCI_IGD
+extern int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev);
+#else
+static inline int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev)
+{
+	return -ENODEV;
+}
+#endif
 #endif /* VFIO_PCI_PRIVATE_H */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 1c37a0e500c6..e622982dbc53 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -280,6 +280,11 @@ struct vfio_region_info_cap_type {
 	__u32 subtype;	/* type specific */
 };
 
+#define VFIO_REGION_TYPE_PCI_VENDOR_TYPE	(1 << 31)
+#define VFIO_REGION_TYPE_PCI_VENDOR_MASK	(0xffff)
+
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION	(1)
+
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
  *				    struct vfio_irq_info)
-- 
cgit v1.2.3


From f572a960a15e8bb56599f6d2358a9c18f0808e91 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 22 Feb 2016 16:02:45 -0700
Subject: vfio/pci: Intel IGD host and LCP bridge config space access

Provide read-only access to PCI config space of the PCI host bridge
and LPC bridge through device specific regions.  This may be used to
configure a VM with matching register contents to satisfy driver
requirements.  Providing this through the vfio file descriptor removes
an additional userspace requirement for access through pci-sysfs and
removes the CAP_SYS_ADMIN requirement that doesn't appear to apply to
the specific devices we're accessing.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci.c         |  15 +++-
 drivers/vfio/pci/vfio_pci_igd.c     | 171 +++++++++++++++++++++++++++++++++++-
 drivers/vfio/pci/vfio_pci_private.h |   4 +-
 include/uapi/linux/vfio.h           |   3 +
 4 files changed, 186 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index cb2624db37d8..74a375297494 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -111,6 +111,7 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
 }
 
 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev);
+static void vfio_pci_disable(struct vfio_pci_device *vdev);
 
 static int vfio_pci_enable(struct vfio_pci_device *vdev)
 {
@@ -170,10 +171,16 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev)
 		vdev->has_vga = true;
 
 
-	if (vfio_pci_is_vga(pdev) && pdev->vendor == PCI_VENDOR_ID_INTEL) {
-		if (vfio_pci_igd_opregion_init(vdev) == 0)
-			dev_info(&pdev->dev,
-				 "Intel IGD OpRegion support enabled\n");
+	if (vfio_pci_is_vga(pdev) &&
+	    pdev->vendor == PCI_VENDOR_ID_INTEL &&
+	    IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
+		ret = vfio_pci_igd_init(vdev);
+		if (ret) {
+			dev_warn(&vdev->pdev->dev,
+				 "Failed to setup Intel IGD regions\n");
+			vfio_pci_disable(vdev);
+			return ret;
+		}
 	}
 
 	return 0;
diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c
index 3b6a6f7b367b..6394b168ef29 100644
--- a/drivers/vfio/pci/vfio_pci_igd.c
+++ b/drivers/vfio/pci/vfio_pci_igd.c
@@ -55,7 +55,7 @@ static const struct vfio_pci_regops vfio_pci_igd_regops = {
 	.release	= vfio_pci_igd_release,
 };
 
-int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev)
+static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev)
 {
 	__le32 *dwordp = (__le32 *)(vdev->vconfig + OPREGION_PCI_ADDR);
 	u32 addr, size;
@@ -109,3 +109,172 @@ int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev)
 
 	return ret;
 }
+
+static size_t vfio_pci_igd_cfg_rw(struct vfio_pci_device *vdev,
+				  char __user *buf, size_t count, loff_t *ppos,
+				  bool iswrite)
+{
+	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
+	struct pci_dev *pdev = vdev->region[i].data;
+	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+	size_t size;
+	int ret;
+
+	if (pos >= vdev->region[i].size || iswrite)
+		return -EINVAL;
+
+	size = count = min(count, (size_t)(vdev->region[i].size - pos));
+
+	if ((pos & 1) && size) {
+		u8 val;
+
+		ret = pci_user_read_config_byte(pdev, pos, &val);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		if (copy_to_user(buf + count - size, &val, 1))
+			return -EFAULT;
+
+		pos++;
+		size--;
+	}
+
+	if ((pos & 3) && size > 2) {
+		u16 val;
+
+		ret = pci_user_read_config_word(pdev, pos, &val);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		val = cpu_to_le16(val);
+		if (copy_to_user(buf + count - size, &val, 2))
+			return -EFAULT;
+
+		pos += 2;
+		size -= 2;
+	}
+
+	while (size > 3) {
+		u32 val;
+
+		ret = pci_user_read_config_dword(pdev, pos, &val);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		val = cpu_to_le32(val);
+		if (copy_to_user(buf + count - size, &val, 4))
+			return -EFAULT;
+
+		pos += 4;
+		size -= 4;
+	}
+
+	while (size >= 2) {
+		u16 val;
+
+		ret = pci_user_read_config_word(pdev, pos, &val);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		val = cpu_to_le16(val);
+		if (copy_to_user(buf + count - size, &val, 2))
+			return -EFAULT;
+
+		pos += 2;
+		size -= 2;
+	}
+
+	while (size) {
+		u8 val;
+
+		ret = pci_user_read_config_byte(pdev, pos, &val);
+		if (ret)
+			return pcibios_err_to_errno(ret);
+
+		if (copy_to_user(buf + count - size, &val, 1))
+			return -EFAULT;
+
+		pos++;
+		size--;
+	}
+
+	*ppos += count;
+
+	return count;
+}
+
+static void vfio_pci_igd_cfg_release(struct vfio_pci_device *vdev,
+				     struct vfio_pci_region *region)
+{
+	struct pci_dev *pdev = region->data;
+
+	pci_dev_put(pdev);
+}
+
+static const struct vfio_pci_regops vfio_pci_igd_cfg_regops = {
+	.rw		= vfio_pci_igd_cfg_rw,
+	.release	= vfio_pci_igd_cfg_release,
+};
+
+static int vfio_pci_igd_cfg_init(struct vfio_pci_device *vdev)
+{
+	struct pci_dev *host_bridge, *lpc_bridge;
+	int ret;
+
+	host_bridge = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0, 0));
+	if (!host_bridge)
+		return -ENODEV;
+
+	if (host_bridge->vendor != PCI_VENDOR_ID_INTEL ||
+	    host_bridge->class != (PCI_CLASS_BRIDGE_HOST << 8)) {
+		pci_dev_put(host_bridge);
+		return -EINVAL;
+	}
+
+	ret = vfio_pci_register_dev_region(vdev,
+		PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+		VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG,
+		&vfio_pci_igd_cfg_regops, host_bridge->cfg_size,
+		VFIO_REGION_INFO_FLAG_READ, host_bridge);
+	if (ret) {
+		pci_dev_put(host_bridge);
+		return ret;
+	}
+
+	lpc_bridge = pci_get_domain_bus_and_slot(0, 0, PCI_DEVFN(0x1f, 0));
+	if (!lpc_bridge)
+		return -ENODEV;
+
+	if (lpc_bridge->vendor != PCI_VENDOR_ID_INTEL ||
+	    lpc_bridge->class != (PCI_CLASS_BRIDGE_ISA << 8)) {
+		pci_dev_put(lpc_bridge);
+		return -EINVAL;
+	}
+
+	ret = vfio_pci_register_dev_region(vdev,
+		PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
+		VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG,
+		&vfio_pci_igd_cfg_regops, lpc_bridge->cfg_size,
+		VFIO_REGION_INFO_FLAG_READ, lpc_bridge);
+	if (ret) {
+		pci_dev_put(lpc_bridge);
+		return ret;
+	}
+
+	return 0;
+}
+
+int vfio_pci_igd_init(struct vfio_pci_device *vdev)
+{
+	int ret;
+
+	ret = vfio_pci_igd_opregion_init(vdev);
+	if (ret)
+		return ret;
+
+	ret = vfio_pci_igd_cfg_init(vdev);
+	if (ret)
+		return ret;
+
+	return 0;
+}
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index 19f7699ac699..8a7d546d18a0 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -123,9 +123,9 @@ extern int vfio_pci_register_dev_region(struct vfio_pci_device *vdev,
 					const struct vfio_pci_regops *ops,
 					size_t size, u32 flags, void *data);
 #ifdef CONFIG_VFIO_PCI_IGD
-extern int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev);
+extern int vfio_pci_igd_init(struct vfio_pci_device *vdev);
 #else
-static inline int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev)
+static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev)
 {
 	return -ENODEV;
 }
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e622982dbc53..255a2113f53c 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -283,7 +283,10 @@ struct vfio_region_info_cap_type {
 #define VFIO_REGION_TYPE_PCI_VENDOR_TYPE	(1 << 31)
 #define VFIO_REGION_TYPE_PCI_VENDOR_MASK	(0xffff)
 
+/* 8086 Vendor sub-types */
 #define VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION	(1)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG	(2)
+#define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG	(3)
 
 /**
  * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
-- 
cgit v1.2.3


From d4634e8dea13ccc969dd3f33dab3873cfdf3bc51 Mon Sep 17 00:00:00 2001
From: João Paulo Rechi Vita <jprvita@gmail.com>
Date: Tue, 19 Jan 2016 10:42:42 -0500
Subject: rfkill: Update userspace API documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a note to userspace on the effect of RFKILL_OP_CHANGE_ALL also
updating the default state for hotplugged devices.

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
[reword a bit]
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/rfkill.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h
index 058757f7a733..2e00dcebebd0 100644
--- a/include/uapi/linux/rfkill.h
+++ b/include/uapi/linux/rfkill.h
@@ -59,6 +59,8 @@ enum rfkill_type {
  * @RFKILL_OP_DEL: a device was removed
  * @RFKILL_OP_CHANGE: a device's state changed -- userspace changes one device
  * @RFKILL_OP_CHANGE_ALL: userspace changes all devices (of a type, or all)
+ *	into a state, also updating the default state used for devices that
+ *	are hot-plugged later.
  */
 enum rfkill_operation {
 	RFKILL_OP_ADD = 0,
-- 
cgit v1.2.3


From 34d505193bd10668acf1caba02d2f66bddc23fea Mon Sep 17 00:00:00 2001
From: Lior David <liord@codeaurora.org>
Date: Thu, 28 Jan 2016 10:58:25 +0200
Subject: cfg80211: basic support for PBSS network type

PBSS (Personal Basic Service Set) is a new BSS type for DMG
networks. It is similar to infrastructure BSS, having an AP-like
entity called PCP (PBSS Control Point), but it has few differences.
PBSS support is mandatory for 11ad devices.

Add support for PBSS by introducing a new PBSS flag attribute.
The PBSS flag is used in the START_AP command to request starting
a PCP instead of an AP, and in the CONNECT command to request
connecting to a PCP instead of an AP.

Signed-off-by: Lior David <liord@codeaurora.org>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 drivers/net/wireless/ath/wil6210/cfg80211.c | 10 ++++++++++
 include/net/cfg80211.h                      |  8 ++++++++
 include/uapi/linux/nl80211.h                |  6 ++++++
 net/wireless/nl80211.c                      | 11 +++++++++++
 net/wireless/sme.c                          |  9 ++++++---
 5 files changed, 41 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c
index 20d07ef679e8..1f231cd08138 100644
--- a/drivers/net/wireless/ath/wil6210/cfg80211.c
+++ b/drivers/net/wireless/ath/wil6210/cfg80211.c
@@ -422,6 +422,11 @@ static int wil_cfg80211_connect(struct wiphy *wiphy,
 	if (sme->privacy && !rsn_eid)
 		wil_info(wil, "WSC connection\n");
 
+	if (sme->pbss) {
+		wil_err(wil, "connect - PBSS not yet supported\n");
+		return -EOPNOTSUPP;
+	}
+
 	bss = cfg80211_get_bss(wiphy, sme->channel, sme->bssid,
 			       sme->ssid, sme->ssid_len,
 			       IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY);
@@ -870,6 +875,11 @@ static int wil_cfg80211_start_ap(struct wiphy *wiphy,
 		return -EINVAL;
 	}
 
+	if (info->pbss) {
+		wil_err(wil, "AP: PBSS not yet supported\n");
+		return -EOPNOTSUPP;
+	}
+
 	switch (info->hidden_ssid) {
 	case NL80211_HIDDEN_SSID_NOT_IN_USE:
 		hidden_ssid = WMI_HIDDEN_SSID_DISABLED;
diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 9bcaaf7cd15a..9e1b24c29f0c 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -712,6 +712,8 @@ struct cfg80211_acl_data {
  * @p2p_opp_ps: P2P opportunistic PS
  * @acl: ACL configuration used by the drivers which has support for
  *	MAC address based access control
+ * @pbss: If set, start as a PCP instead of AP. Relevant for DMG
+ *	networks.
  */
 struct cfg80211_ap_settings {
 	struct cfg80211_chan_def chandef;
@@ -730,6 +732,7 @@ struct cfg80211_ap_settings {
 	u8 p2p_ctwindow;
 	bool p2p_opp_ps;
 	const struct cfg80211_acl_data *acl;
+	bool pbss;
 };
 
 /**
@@ -1888,6 +1891,8 @@ struct cfg80211_ibss_params {
  * @ht_capa_mask:  The bits of ht_capa which are to be used.
  * @vht_capa:  VHT Capability overrides
  * @vht_capa_mask: The bits of vht_capa which are to be used.
+ * @pbss: if set, connect to a PCP instead of AP. Valid for DMG
+ *	networks.
  */
 struct cfg80211_connect_params {
 	struct ieee80211_channel *channel;
@@ -1910,6 +1915,7 @@ struct cfg80211_connect_params {
 	struct ieee80211_ht_cap ht_capa_mask;
 	struct ieee80211_vht_cap vht_capa;
 	struct ieee80211_vht_cap vht_capa_mask;
+	bool pbss;
 };
 
 /**
@@ -3489,6 +3495,7 @@ struct cfg80211_cached_keys;
  *	registered for unexpected class 3 frames (AP mode)
  * @conn: (private) cfg80211 software SME connection state machine data
  * @connect_keys: (private) keys to set after connection is established
+ * @conn_bss_type: connecting/connected BSS type
  * @ibss_fixed: (private) IBSS is using fixed BSSID
  * @ibss_dfs_possible: (private) IBSS may change to a DFS channel
  * @event_list: (private) list for internal event processing
@@ -3519,6 +3526,7 @@ struct wireless_dev {
 	u8 ssid_len, mesh_id_len, mesh_id_up_len;
 	struct cfg80211_conn *conn;
 	struct cfg80211_cached_keys *connect_keys;
+	enum ieee80211_bss_type conn_bss_type;
 
 	struct list_head event_list;
 	spinlock_t event_lock;
diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 5b7b5ebe7ca8..7758969a2a8e 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1789,6 +1789,10 @@ enum nl80211_commands {
  *	thus it must not specify the number of iterations, only the interval
  *	between scans. The scan plans are executed sequentially.
  *	Each scan plan is a nested attribute of &enum nl80211_sched_scan_plan.
+ * @NL80211_ATTR_PBSS: flag attribute. If set it means operate
+ *	in a PBSS. Specified in %NL80211_CMD_CONNECT to request
+ *	connecting to a PCP, and in %NL80211_CMD_START_AP to start
+ *	a PCP instead of AP. Relevant for DMG networks only.
  *
  * @NUM_NL80211_ATTR: total number of nl80211_attrs available
  * @NL80211_ATTR_MAX: highest attribute number currently defined
@@ -2164,6 +2168,8 @@ enum nl80211_attrs {
 	NL80211_ATTR_MAX_SCAN_PLAN_ITERATIONS,
 	NL80211_ATTR_SCHED_SCAN_PLANS,
 
+	NL80211_ATTR_PBSS,
+
 	/* add attributes here, update the policy in nl80211.c */
 
 	__NL80211_ATTR_AFTER_LAST,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index d4786f2802aa..268cb493f6a5 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -401,6 +401,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
 	[NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 },
 	[NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 },
 	[NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG },
+	[NL80211_ATTR_PBSS] = { .type = NLA_FLAG },
 };
 
 /* policy for the key attributes */
@@ -3461,6 +3462,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
 			return PTR_ERR(params.acl);
 	}
 
+	params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
+	if (params.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ])
+		return -EOPNOTSUPP;
+
 	wdev_lock(wdev);
 	err = rdev_start_ap(rdev, dev, &params);
 	if (!err) {
@@ -7980,6 +7985,12 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 		connect.flags |= ASSOC_REQ_USE_RRM;
 	}
 
+	connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
+	if (connect.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) {
+		kzfree(connkeys);
+		return -EOPNOTSUPP;
+	}
+
 	wdev_lock(dev->ieee80211_ptr);
 	err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL);
 	wdev_unlock(dev->ieee80211_ptr);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 8020b5b094d4..79bd3a171caa 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -264,7 +264,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev)
 			       wdev->conn->params.bssid,
 			       wdev->conn->params.ssid,
 			       wdev->conn->params.ssid_len,
-			       IEEE80211_BSS_TYPE_ESS,
+			       wdev->conn_bss_type,
 			       IEEE80211_PRIVACY(wdev->conn->params.privacy));
 	if (!bss)
 		return NULL;
@@ -687,7 +687,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
 		WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect);
 		bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid,
 				       wdev->ssid, wdev->ssid_len,
-				       IEEE80211_BSS_TYPE_ESS,
+				       wdev->conn_bss_type,
 				       IEEE80211_PRIVACY_ANY);
 		if (bss)
 			cfg80211_hold_bss(bss_from_pub(bss));
@@ -846,7 +846,7 @@ void cfg80211_roamed(struct net_device *dev,
 
 	bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, wdev->ssid,
 			       wdev->ssid_len,
-			       IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY);
+			       wdev->conn_bss_type, IEEE80211_PRIVACY_ANY);
 	if (WARN_ON(!bss))
 		return;
 
@@ -1017,6 +1017,9 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
 	memcpy(wdev->ssid, connect->ssid, connect->ssid_len);
 	wdev->ssid_len = connect->ssid_len;
 
+	wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS :
+					      IEEE80211_BSS_TYPE_ESS;
+
 	if (!rdev->ops->connect)
 		err = cfg80211_sme_connect(wdev, connect, prev_bssid);
 	else
-- 
cgit v1.2.3


From 0c9ca11b1ae8eb16c1b6bbae91991392d2321372 Mon Sep 17 00:00:00 2001
From: Beni Lev <beni.lev@intel.com>
Date: Wed, 17 Feb 2016 20:30:00 +0200
Subject: cfg80211: Add global RRM capability

Today, the supplicant will add the RRM capabilities
Information Element in the association request only if
Quiet period is supported (NL80211_FEATURE_QUIET).

Quiet is one of many RRM features, and there are other RRM
features that are not related to Quiet (e.g. neighbor
report). Therefore, requiring Quiet to enable RRM is too
restrictive.
Some of the features, like neighbor report, can be
supported by user space without any help from the kernel.
Hence adding the RRM capabilities IE to association request
should be the sole user space's decision.
Removing the RRM dependency on Quiet in the driver solves
this problem, but using an old driver with a user space
tool that would not require Quiet feature would be
problematic: the user space would add NL80211_ATTR_USE_RRM
in the association request even if the kernel doesn't
advertize NL80211_FEATURE_QUIET and the association would
be denied by the kernel.

This solution adds a global RRM capability, that tells user
space that it can request RRM capabilities IE publishment
without any specific feature support in the kernel.

Signed-off-by: Beni Lev <beni.lev@intel.com>
Signed-off-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 include/uapi/linux/nl80211.h |  8 ++++++++
 net/wireless/nl80211.c       | 18 +++++++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h
index 7758969a2a8e..5a30a7563633 100644
--- a/include/uapi/linux/nl80211.h
+++ b/include/uapi/linux/nl80211.h
@@ -1727,6 +1727,8 @@ enum nl80211_commands {
  *	underlying device supports these minimal RRM features:
  *		%NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES,
  *		%NL80211_FEATURE_QUIET,
+ *	Or, if global RRM is supported, see:
+ *		%NL80211_EXT_FEATURE_RRM
  *	If this flag is used, driver must add the Power Capabilities IE to the
  *	association request. In addition, it must also set the RRM capability
  *	flag in the association request's Capability Info field.
@@ -4402,12 +4404,18 @@ enum nl80211_feature_flags {
 /**
  * enum nl80211_ext_feature_index - bit index of extended features.
  * @NL80211_EXT_FEATURE_VHT_IBSS: This driver supports IBSS with VHT datarates.
+ * @NL80211_EXT_FEATURE_RRM: This driver supports RRM. When featured, user can
+ *	can request to use RRM (see %NL80211_ATTR_USE_RRM) with
+ *	%NL80211_CMD_ASSOCIATE and %NL80211_CMD_CONNECT requests, which will set
+ *	the ASSOC_REQ_USE_RRM flag in the association request even if
+ *	NL80211_FEATURE_QUIET is not advertized.
  *
  * @NUM_NL80211_EXT_FEATURES: number of extended features.
  * @MAX_NL80211_EXT_FEATURES: highest extended feature index.
  */
 enum nl80211_ext_feature_index {
 	NL80211_EXT_FEATURE_VHT_IBSS,
+	NL80211_EXT_FEATURE_RRM,
 
 	/* add new features before the definition below */
 	NUM_NL80211_EXT_FEATURES,
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 268cb493f6a5..90890f183c0e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3,7 +3,7 @@
  *
  * Copyright 2006-2010	Johannes Berg <johannes@sipsolutions.net>
  * Copyright 2013-2014  Intel Mobile Communications GmbH
- * Copyright 2015	Intel Deutschland GmbH
+ * Copyright 2015-2016	Intel Deutschland GmbH
  */
 
 #include <linux/if.h>
@@ -7286,9 +7286,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
-		if (!(rdev->wiphy.features &
-		      NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
-		    !(rdev->wiphy.features & NL80211_FEATURE_QUIET))
+		if (!((rdev->wiphy.features &
+			NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
+		       (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_RRM))
 			return -EINVAL;
 		req.flags |= ASSOC_REQ_USE_RRM;
 	}
@@ -7976,9 +7978,11 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
-		if (!(rdev->wiphy.features &
-		      NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
-		    !(rdev->wiphy.features & NL80211_FEATURE_QUIET)) {
+		if (!((rdev->wiphy.features &
+			NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
+		       (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
+		    !wiphy_ext_feature_isset(&rdev->wiphy,
+					     NL80211_EXT_FEATURE_RRM)) {
 			kzfree(connkeys);
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From 214338e372af2b856af07978daa771dbe087f990 Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 25 Feb 2016 21:01:48 +0100
Subject: gpio: present the consumer of a line to userspace

I named the field representing the current user of GPIO line as
"label" but this is too vague and ambiguous. Before anyone gets
confused, rename it to "consumer" and indicate clearly in the
documentation that this is a string set by the user of the line.

Also clean up leftovers in the documentation.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/gpio/gpiolib.c    |  8 ++++----
 include/uapi/linux/gpio.h | 22 ++++++++++++----------
 tools/gpio/lsgpio.c       |  6 +++---
 3 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index 872774a404f1..bc788b958c7e 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -368,11 +368,11 @@ static long gpio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			lineinfo.name[0] = '\0';
 		}
 		if (desc->label) {
-			strncpy(lineinfo.label, desc->label,
-				sizeof(lineinfo.label));
-			lineinfo.label[sizeof(lineinfo.label)-1] = '\0';
+			strncpy(lineinfo.consumer, desc->label,
+				sizeof(lineinfo.consumer));
+			lineinfo.consumer[sizeof(lineinfo.consumer)-1] = '\0';
 		} else {
-			lineinfo.label[0] = '\0';
+			lineinfo.consumer[0] = '\0';
 		}
 
 		/*
diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h
index 416ce47f2291..dfe8ade2742d 100644
--- a/include/uapi/linux/gpio.h
+++ b/include/uapi/linux/gpio.h
@@ -15,8 +15,9 @@
 
 /**
  * struct gpiochip_info - Information about a certain GPIO chip
- * @name: the name of this GPIO chip
- * @label: a functional name for this GPIO chip
+ * @name: the Linux kernel name of this GPIO chip
+ * @label: a functional name for this GPIO chip, such as a product
+ * number, may be NULL
  * @lines: number of GPIO lines on this chip
  */
 struct gpiochip_info {
@@ -34,20 +35,21 @@ struct gpiochip_info {
 
 /**
  * struct gpioline_info - Information about a certain GPIO line
- * @line_offset: the local offset on this GPIO device, fill in when
- * requesting information from the kernel
+ * @line_offset: the local offset on this GPIO device, fill this in when
+ * requesting the line information from the kernel
  * @flags: various flags for this line
- * @name: the name of this GPIO line
- * @label: a functional name for this GPIO line
- * @kernel: this GPIO is in use by the kernel
- * @out: this GPIO is an output line (false means it is an input)
- * @active_low: this GPIO is active low
+ * @name: the name of this GPIO line, such as the output pin of the line on the
+ * chip, a rail or a pin header name on a board, as specified by the gpio
+ * chip, may be NULL
+ * @consumer: a functional name for the consumer of this GPIO line as set by
+ * whatever is using it, will be NULL if there is no current user but may
+ * also be NULL if the consumer doesn't set this up
  */
 struct gpioline_info {
 	__u32 line_offset;
 	__u32 flags;
 	char name[32];
-	char label[32];
+	char consumer[32];
 };
 
 #define GPIO_GET_CHIPINFO_IOCTL _IOR('o', 0x01, struct gpiochip_info)
diff --git a/tools/gpio/lsgpio.c b/tools/gpio/lsgpio.c
index 6af118cc7efb..1124da375942 100644
--- a/tools/gpio/lsgpio.c
+++ b/tools/gpio/lsgpio.c
@@ -116,10 +116,10 @@ int list_device(const char *device_name)
 			fprintf(stdout, " \"%s\"", linfo.name);
 		else
 			fprintf(stdout, " unnamed");
-		if (linfo.label[0])
-			fprintf(stdout, " \"%s\"", linfo.label);
+		if (linfo.consumer[0])
+			fprintf(stdout, " \"%s\"", linfo.consumer);
 		else
-			fprintf(stdout, " unlabeled");
+			fprintf(stdout, " unused");
 		if (linfo.flags) {
 			fprintf(stdout, " [");
 			print_flags(linfo.flags);
-- 
cgit v1.2.3


From f1705ec197e705b79ea40fe7a2cc5acfa1d3bfac Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Wed, 24 Feb 2016 09:25:37 -0800
Subject: net: ipv6: Make address flushing on ifdown optional

Currently, all ipv6 addresses are flushed when the interface is configured
down, including global, static addresses:

    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 state UP qlen 1000
        inet6 2100:1::2/120 scope global
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link
           valid_lft forever preferred_lft forever
    $ ip link set dev eth1 down
    $ ip -6 addr show dev eth1
    << nothing; all addresses have been flushed>>

Add a new sysctl to make this behavior optional. The new setting defaults to
flush all addresses to maintain backwards compatibility. When the set global
addresses with no expire times are not flushed on an admin down. The sysctl
is per-interface or system-wide for all interfaces

    $ sysctl -w net.ipv6.conf.eth1.keep_addr_on_down=1
or
    $ sysctl -w net.ipv6.conf.all.keep_addr_on_down=1

Will keep addresses on eth1 on an admin down.

    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 state UP qlen 1000
        inet6 2100:1::2/120 scope global
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link
           valid_lft forever preferred_lft forever
    $ ip link set dev eth1 down
    $ ip -6 addr show dev eth1
    3: eth1: <BROADCAST,MULTICAST> mtu 1500 state DOWN qlen 1000
        inet6 2100:1::2/120 scope global tentative
           valid_lft forever preferred_lft forever
        inet6 fe80::e0:f9ff:fe79:34bd/64 scope link tentative
           valid_lft forever preferred_lft forever

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |   9 +++
 include/linux/ipv6.h                   |   1 +
 include/uapi/linux/ipv6.h              |   1 +
 net/ipv6/addrconf.c                    | 136 +++++++++++++++++++++++++++++----
 4 files changed, 132 insertions(+), 15 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 24ce97f42d35..d5df40c75aa4 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1563,6 +1563,15 @@ temp_prefered_lft - INTEGER
 	Preferred lifetime (in seconds) for temporary addresses.
 	Default: 86400 (1 day)
 
+keep_addr_on_down - INTEGER
+	Keep all IPv6 addresses on an interface down event. If set static
+	global addresses with no expiration time are not flushed.
+	  >0 : enabled
+	   0 : system default
+	  <0 : disabled
+
+	Default: 0 (addresses are removed)
+
 max_desync_factor - INTEGER
 	Maximum value for DESYNC_FACTOR, which is a random value
 	that ensures that clients don't synchronize with each
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 4b2267e1b7c3..7edc14fb66b6 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -62,6 +62,7 @@ struct ipv6_devconf {
 		struct in6_addr secret;
 	} stable_secret;
 	__s32		use_oif_addrs_only;
+	__s32		keep_addr_on_down;
 	void		*sysctl;
 };
 
diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h
index ec117b65d5a5..395876060f50 100644
--- a/include/uapi/linux/ipv6.h
+++ b/include/uapi/linux/ipv6.h
@@ -176,6 +176,7 @@ enum {
 	DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN,
 	DEVCONF_DROP_UNICAST_IN_L2_MULTICAST,
 	DEVCONF_DROP_UNSOLICITED_NA,
+	DEVCONF_KEEP_ADDR_ON_DOWN,
 	DEVCONF_MAX
 };
 
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 4751f8922362..a2d6f6c242af 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -216,6 +216,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
+	.keep_addr_on_down	= 0,
 };
 
 static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
 	},
 	.use_oif_addrs_only	= 0,
 	.ignore_routes_with_linkdown = 0,
+	.keep_addr_on_down	= 0,
 };
 
 /* Check if a valid qdisc is available */
@@ -3168,6 +3170,55 @@ static void addrconf_gre_config(struct net_device *dev)
 }
 #endif
 
+static int fixup_permanent_addr(struct inet6_dev *idev,
+				struct inet6_ifaddr *ifp)
+{
+	if (!ifp->rt) {
+		struct rt6_info *rt;
+
+		rt = addrconf_dst_alloc(idev, &ifp->addr, false);
+		if (unlikely(IS_ERR(rt)))
+			return PTR_ERR(rt);
+
+		ifp->rt = rt;
+	}
+
+	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
+		addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+				      idev->dev, 0, 0);
+	}
+
+	addrconf_dad_start(ifp);
+
+	return 0;
+}
+
+static void addrconf_permanent_addr(struct net_device *dev)
+{
+	struct inet6_ifaddr *ifp, *tmp;
+	struct inet6_dev *idev;
+
+	idev = __in6_dev_get(dev);
+	if (!idev)
+		return;
+
+	write_lock_bh(&idev->lock);
+
+	list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
+		if ((ifp->flags & IFA_F_PERMANENT) &&
+		    fixup_permanent_addr(idev, ifp) < 0) {
+			write_unlock_bh(&idev->lock);
+			ipv6_del_addr(ifp);
+			write_lock_bh(&idev->lock);
+
+			net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n",
+					     idev->dev->name, &ifp->addr);
+		}
+	}
+
+	write_unlock_bh(&idev->lock);
+}
+
 static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			   void *ptr)
 {
@@ -3253,6 +3304,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
 			run_pending = 1;
 		}
 
+		/* restore routes for permanent addresses */
+		addrconf_permanent_addr(dev);
+
 		switch (dev->type) {
 #if IS_ENABLED(CONFIG_IPV6_SIT)
 		case ARPHRD_SIT:
@@ -3356,7 +3410,10 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 {
 	struct net *net = dev_net(dev);
 	struct inet6_dev *idev;
-	struct inet6_ifaddr *ifa;
+	struct inet6_ifaddr *ifa, *tmp;
+	struct list_head del_list;
+	int _keep_addr;
+	bool keep_addr;
 	int state, i;
 
 	ASSERT_RTNL();
@@ -3383,6 +3440,16 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	}
 
+	/* aggregate the system setting and interface setting */
+	_keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
+	if (!_keep_addr)
+		_keep_addr = idev->cnf.keep_addr_on_down;
+
+	/* combine the user config with event to determine if permanent
+	 * addresses are to be removed from address hash table
+	 */
+	keep_addr = !(how || _keep_addr <= 0);
+
 	/* Step 2: clear hash table */
 	for (i = 0; i < IN6_ADDR_HSIZE; i++) {
 		struct hlist_head *h = &inet6_addr_lst[i];
@@ -3391,9 +3458,15 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 restart:
 		hlist_for_each_entry_rcu(ifa, h, addr_lst) {
 			if (ifa->idev == idev) {
-				hlist_del_init_rcu(&ifa->addr_lst);
 				addrconf_del_dad_work(ifa);
-				goto restart;
+				/* combined flag + permanent flag decide if
+				 * address is retained on a down event
+				 */
+				if (!keep_addr ||
+				    !(ifa->flags & IFA_F_PERMANENT)) {
+					hlist_del_init_rcu(&ifa->addr_lst);
+					goto restart;
+				}
 			}
 		}
 		spin_unlock_bh(&addrconf_hash_lock);
@@ -3427,31 +3500,53 @@ restart:
 		write_lock_bh(&idev->lock);
 	}
 
-	while (!list_empty(&idev->addr_list)) {
-		ifa = list_first_entry(&idev->addr_list,
-				       struct inet6_ifaddr, if_list);
-		addrconf_del_dad_work(ifa);
+	/* re-combine the user config with event to determine if permanent
+	 * addresses are to be removed from the interface list
+	 */
+	keep_addr = (!how && _keep_addr > 0);
 
-		list_del(&ifa->if_list);
+	INIT_LIST_HEAD(&del_list);
+	list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
+		addrconf_del_dad_work(ifa);
 
 		write_unlock_bh(&idev->lock);
-
 		spin_lock_bh(&ifa->lock);
-		state = ifa->state;
-		ifa->state = INET6_IFADDR_STATE_DEAD;
+
+		if (keep_addr && (ifa->flags & IFA_F_PERMANENT)) {
+			/* set state to skip the notifier below */
+			state = INET6_IFADDR_STATE_DEAD;
+			ifa->state = 0;
+			if (!(ifa->flags & IFA_F_NODAD))
+				ifa->flags |= IFA_F_TENTATIVE;
+		} else {
+			state = ifa->state;
+			ifa->state = INET6_IFADDR_STATE_DEAD;
+
+			list_del(&ifa->if_list);
+			list_add(&ifa->if_list, &del_list);
+		}
+
 		spin_unlock_bh(&ifa->lock);
 
 		if (state != INET6_IFADDR_STATE_DEAD) {
 			__ipv6_ifa_notify(RTM_DELADDR, ifa);
 			inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
 		}
-		in6_ifa_put(ifa);
 
 		write_lock_bh(&idev->lock);
 	}
 
 	write_unlock_bh(&idev->lock);
 
+	/* now clean up addresses to be removed */
+	while (!list_empty(&del_list)) {
+		ifa = list_first_entry(&del_list,
+				       struct inet6_ifaddr, if_list);
+		list_del(&ifa->if_list);
+
+		in6_ifa_put(ifa);
+	}
+
 	/* Step 5: Discard anycast and multicast list */
 	if (how) {
 		ipv6_ac_destroy_dev(idev);
@@ -4716,6 +4811,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
 	array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
 	array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
 	array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
+	array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
 }
 
 static inline size_t inet6_ifla6_size(void)
@@ -5197,10 +5293,12 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
 			if (rt)
 				ip6_del_rt(rt);
 		}
-		dst_hold(&ifp->rt->dst);
-
-		ip6_del_rt(ifp->rt);
+		if (ifp->rt) {
+			dst_hold(&ifp->rt->dst);
 
+			ip6_del_rt(ifp->rt);
+			ifp->rt = NULL;
+		}
 		rt_genid_bump_ipv6(net);
 		break;
 	}
@@ -5803,6 +5901,14 @@ static struct addrconf_sysctl_table
 			.mode		= 0644,
 			.proc_handler	= proc_dointvec,
 		},
+		{
+			.procname       = "keep_addr_on_down",
+			.data           = &ipv6_devconf.keep_addr_on_down,
+			.maxlen         = sizeof(int),
+			.mode           = 0644,
+			.proc_handler   = proc_dointvec,
+
+		},
 		{
 			/* sentinel */
 		}
-- 
cgit v1.2.3


From 3f1ac7a700d039c61d8d8b99f28d605d489a60cf Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Wed, 24 Feb 2016 10:57:59 -0800
Subject: net: ethtool: add new ETHTOOL_xLINKSETTINGS API

This patch defines a new ETHTOOL_GLINKSETTINGS/SLINKSETTINGS API,
handled by the new get_link_ksettings/set_link_ksettings callbacks.
This API provides support for most legacy ethtool_cmd fields, adds
support for larger link mode masks (up to 4064 bits, variable length),
and removes ethtool_cmd deprecated
fields (transceiver/maxrxpkt/maxtxpkt).

This API is deprecating the legacy ETHTOOL_GSET/SSET API and provides
the following backward compatibility properties:
 - legacy ethtool with legacy drivers: no change, still using the
   get_settings/set_settings callbacks.
 - legacy ethtool with new get/set_link_ksettings drivers: the new
   driver callbacks are used, data internally converted to legacy
   ethtool_cmd. ETHTOOL_GSET will return only the 1st 32b of each link
   mode mask. ETHTOOL_SSET will fail if user tries to set the
   ethtool_cmd deprecated fields to
   non-0 (transceiver/maxrxpkt/maxtxpkt). A kernel warning is logged if
   driver sets higher bits.
 - future ethtool with legacy drivers: no change, still using the
   get_settings/set_settings callbacks, internally converted to new data
   structure. Deprecated fields (transceiver/maxrxpkt/maxtxpkt) will be
   ignored and seen as 0 from user space. Note that that "future"
   ethtool tool will not allow changes to these deprecated fields.
 - future ethtool with new drivers: direct call to the new callbacks.

By "future" ethtool, what is meant is:
 - query: first try ETHTOOL_GLINKSETTINGS, and revert to ETHTOOL_GSET if
   fails
 - set: query first and remember which of ETHTOOL_GLINKSETTINGS or
   ETHTOOL_GSET was successful
   + if ETHTOOL_GLINKSETTINGS was successful, then change config with
     ETHTOOL_SLINKSETTINGS. A failure there is final (do not try
     ETHTOOL_SSET).
   + otherwise ETHTOOL_GSET was successful, change config with
     ETHTOOL_SSET. A failure there is final (do not try
     ETHTOOL_SLINKSETTINGS).

The interaction user/kernel via the new API requires a small
ETHTOOL_GLINKSETTINGS handshake first to agree on the length of the link
mode bitmaps. If kernel doesn't agree with user, it returns the bitmap
length it is expecting from user as a negative length (and cmd field is
0). When kernel and user agree, kernel returns valid info in all
fields (ie. link mode length > 0 and cmd is ETHTOOL_GLINKSETTINGS).

Data structure crossing user/kernel boundary is 32/64-bit
agnostic. Converted internally to a legal kernel bitmap.

The internal __ethtool_get_settings kernel helper will gradually be
replaced by __ethtool_get_link_ksettings by the time the first
"link_settings" drivers start to appear. So this patch doesn't change
it, it will be removed before it needs to be changed.

Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/ethtool.h      |  91 ++++++++-
 include/uapi/linux/ethtool.h | 322 +++++++++++++++++++++++-------
 net/core/ethtool.c           | 453 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 786 insertions(+), 80 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 472d7d7b01c2..8a400a54c92e 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -12,6 +12,7 @@
 #ifndef _LINUX_ETHTOOL_H
 #define _LINUX_ETHTOOL_H
 
+#include <linux/bitmap.h>
 #include <linux/compat.h>
 #include <uapi/linux/ethtool.h>
 
@@ -40,9 +41,6 @@ struct compat_ethtool_rxnfc {
 
 #include <linux/rculist.h>
 
-extern int __ethtool_get_settings(struct net_device *dev,
-				  struct ethtool_cmd *cmd);
-
 /**
  * enum ethtool_phys_id_state - indicator state for physical identification
  * @ETHTOOL_ID_INACTIVE: Physical ID indicator should be deactivated
@@ -97,13 +95,74 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
 	return index % n_rx_rings;
 }
 
+/* number of link mode bits/ulongs handled internally by kernel */
+#define __ETHTOOL_LINK_MODE_MASK_NBITS			\
+	(__ETHTOOL_LINK_MODE_LAST + 1)
+
+/* declare a link mode bitmap */
+#define __ETHTOOL_DECLARE_LINK_MODE_MASK(name)		\
+	DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS)
+
+/* drivers must ignore base.cmd and base.link_mode_masks_nwords
+ * fields, but they are allowed to overwrite them (will be ignored).
+ */
+struct ethtool_link_ksettings {
+	struct ethtool_link_settings base;
+	struct {
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(supported);
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(advertising);
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising);
+	} link_modes;
+};
+
+/**
+ * ethtool_link_ksettings_zero_link_mode - clear link_ksettings link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ */
+#define ethtool_link_ksettings_zero_link_mode(ptr, name)		\
+	bitmap_zero((ptr)->link_modes.name, __ETHTOOL_LINK_MODE_MASK_NBITS)
+
+/**
+ * ethtool_link_ksettings_add_link_mode - set bit in link_ksettings
+ * link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ *   @mode : one of the ETHTOOL_LINK_MODE_*_BIT
+ * (not atomic, no bound checking)
+ */
+#define ethtool_link_ksettings_add_link_mode(ptr, name, mode)		\
+	__set_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name)
+
+/**
+ * ethtool_link_ksettings_test_link_mode - test bit in ksettings link mode mask
+ *   @ptr : pointer to struct ethtool_link_ksettings
+ *   @name : one of supported/advertising/lp_advertising
+ *   @mode : one of the ETHTOOL_LINK_MODE_*_BIT
+ * (not atomic, no bound checking)
+ *
+ * Returns true/false.
+ */
+#define ethtool_link_ksettings_test_link_mode(ptr, name, mode)		\
+	test_bit(ETHTOOL_LINK_MODE_ ## mode ## _BIT, (ptr)->link_modes.name)
+
+extern int
+__ethtool_get_link_ksettings(struct net_device *dev,
+			     struct ethtool_link_ksettings *link_ksettings);
+
+/* DEPRECATED, use __ethtool_get_link_ksettings */
+extern int __ethtool_get_settings(struct net_device *dev,
+				  struct ethtool_cmd *cmd);
+
 /**
  * struct ethtool_ops - optional netdev operations
- * @get_settings: Get various device settings including Ethernet link
+ * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
+ *	API. Get various device settings including Ethernet link
  *	settings. The @cmd parameter is expected to have been cleared
- *	before get_settings is called. Returns a negative error code or
- *	zero.
- * @set_settings: Set various device settings including Ethernet link
+ *	before get_settings is called. Returns a negative error code
+ *	or zero.
+ * @set_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings
+ *	API. Set various device settings including Ethernet link
  *	settings.  Returns a negative error code or zero.
  * @get_drvinfo: Report driver/device information.  Should only set the
  *	@driver, @version, @fw_version and @bus_info fields.  If not
@@ -211,6 +270,19 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings)
  *	a TX queue has this number, return -EINVAL. If only a RX queue or a TX
  *	queue has this number, ignore the inapplicable fields.
  *	Returns a negative error code or zero.
+ * @get_link_ksettings: When defined, takes precedence over the
+ *	%get_settings method. Get various device settings
+ *	including Ethernet link settings. The %cmd and
+ *	%link_mode_masks_nwords fields should be ignored (use
+ *	%__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), any
+ *	change to them will be overwritten by kernel. Returns a
+ *	negative error code or zero.
+ * @set_link_ksettings: When defined, takes precedence over the
+ *	%set_settings method. Set various device settings including
+ *	Ethernet link settings. The %cmd and %link_mode_masks_nwords
+ *	fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS
+ *	instead of the latter), any change to them will be overwritten
+ *	by kernel. Returns a negative error code or zero.
  *
  * All operations are optional (i.e. the function pointer may be set
  * to %NULL) and callers must take this into account.  Callers must
@@ -293,6 +365,9 @@ struct ethtool_ops {
 					  struct ethtool_coalesce *);
 	int	(*set_per_queue_coalesce)(struct net_device *, u32,
 					  struct ethtool_coalesce *);
-
+	int	(*get_link_ksettings)(struct net_device *,
+				      struct ethtool_link_ksettings *);
+	int	(*set_link_ksettings)(struct net_device *,
+				      const struct ethtool_link_ksettings *);
 };
 #endif /* _LINUX_ETHTOOL_H */
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index f15ae02621a1..37fd6dc33de4 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -21,7 +21,8 @@
  */
 
 /**
- * struct ethtool_cmd - link control and status
+ * struct ethtool_cmd - DEPRECATED, link control and status
+ * This structure is DEPRECATED, please use struct ethtool_link_settings.
  * @cmd: Command number = %ETHTOOL_GSET or %ETHTOOL_SSET
  * @supported: Bitmask of %SUPPORTED_* flags for the link modes,
  *	physical connectors and other link features for which the
@@ -1219,8 +1220,12 @@ struct ethtool_per_queue_op {
 };
 
 /* CMDs currently supported */
-#define ETHTOOL_GSET		0x00000001 /* Get settings. */
-#define ETHTOOL_SSET		0x00000002 /* Set settings. */
+#define ETHTOOL_GSET		0x00000001 /* DEPRECATED, Get settings.
+					    * Please use ETHTOOL_GLINKSETTINGS
+					    */
+#define ETHTOOL_SSET		0x00000002 /* DEPRECATED, Set settings.
+					    * Please use ETHTOOL_SLINKSETTINGS
+					    */
 #define ETHTOOL_GDRVINFO	0x00000003 /* Get driver info. */
 #define ETHTOOL_GREGS		0x00000004 /* Get NIC registers. */
 #define ETHTOOL_GWOL		0x00000005 /* Get wake-on-lan options. */
@@ -1302,73 +1307,139 @@ struct ethtool_per_queue_op {
 
 #define ETHTOOL_PERQUEUE	0x0000004b /* Set per queue options */
 
+#define ETHTOOL_GLINKSETTINGS	0x0000004c /* Get ethtool_link_settings */
+#define ETHTOOL_SLINKSETTINGS	0x0000004d /* Set ethtool_link_settings */
+
+
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
 #define SPARC_ETH_SSET		ETHTOOL_SSET
 
-#define SUPPORTED_10baseT_Half		(1 << 0)
-#define SUPPORTED_10baseT_Full		(1 << 1)
-#define SUPPORTED_100baseT_Half		(1 << 2)
-#define SUPPORTED_100baseT_Full		(1 << 3)
-#define SUPPORTED_1000baseT_Half	(1 << 4)
-#define SUPPORTED_1000baseT_Full	(1 << 5)
-#define SUPPORTED_Autoneg		(1 << 6)
-#define SUPPORTED_TP			(1 << 7)
-#define SUPPORTED_AUI			(1 << 8)
-#define SUPPORTED_MII			(1 << 9)
-#define SUPPORTED_FIBRE			(1 << 10)
-#define SUPPORTED_BNC			(1 << 11)
-#define SUPPORTED_10000baseT_Full	(1 << 12)
-#define SUPPORTED_Pause			(1 << 13)
-#define SUPPORTED_Asym_Pause		(1 << 14)
-#define SUPPORTED_2500baseX_Full	(1 << 15)
-#define SUPPORTED_Backplane		(1 << 16)
-#define SUPPORTED_1000baseKX_Full	(1 << 17)
-#define SUPPORTED_10000baseKX4_Full	(1 << 18)
-#define SUPPORTED_10000baseKR_Full	(1 << 19)
-#define SUPPORTED_10000baseR_FEC	(1 << 20)
-#define SUPPORTED_20000baseMLD2_Full	(1 << 21)
-#define SUPPORTED_20000baseKR2_Full	(1 << 22)
-#define SUPPORTED_40000baseKR4_Full	(1 << 23)
-#define SUPPORTED_40000baseCR4_Full	(1 << 24)
-#define SUPPORTED_40000baseSR4_Full	(1 << 25)
-#define SUPPORTED_40000baseLR4_Full	(1 << 26)
-#define SUPPORTED_56000baseKR4_Full	(1 << 27)
-#define SUPPORTED_56000baseCR4_Full	(1 << 28)
-#define SUPPORTED_56000baseSR4_Full	(1 << 29)
-#define SUPPORTED_56000baseLR4_Full	(1 << 30)
-
-#define ADVERTISED_10baseT_Half		(1 << 0)
-#define ADVERTISED_10baseT_Full		(1 << 1)
-#define ADVERTISED_100baseT_Half	(1 << 2)
-#define ADVERTISED_100baseT_Full	(1 << 3)
-#define ADVERTISED_1000baseT_Half	(1 << 4)
-#define ADVERTISED_1000baseT_Full	(1 << 5)
-#define ADVERTISED_Autoneg		(1 << 6)
-#define ADVERTISED_TP			(1 << 7)
-#define ADVERTISED_AUI			(1 << 8)
-#define ADVERTISED_MII			(1 << 9)
-#define ADVERTISED_FIBRE		(1 << 10)
-#define ADVERTISED_BNC			(1 << 11)
-#define ADVERTISED_10000baseT_Full	(1 << 12)
-#define ADVERTISED_Pause		(1 << 13)
-#define ADVERTISED_Asym_Pause		(1 << 14)
-#define ADVERTISED_2500baseX_Full	(1 << 15)
-#define ADVERTISED_Backplane		(1 << 16)
-#define ADVERTISED_1000baseKX_Full	(1 << 17)
-#define ADVERTISED_10000baseKX4_Full	(1 << 18)
-#define ADVERTISED_10000baseKR_Full	(1 << 19)
-#define ADVERTISED_10000baseR_FEC	(1 << 20)
-#define ADVERTISED_20000baseMLD2_Full	(1 << 21)
-#define ADVERTISED_20000baseKR2_Full	(1 << 22)
-#define ADVERTISED_40000baseKR4_Full	(1 << 23)
-#define ADVERTISED_40000baseCR4_Full	(1 << 24)
-#define ADVERTISED_40000baseSR4_Full	(1 << 25)
-#define ADVERTISED_40000baseLR4_Full	(1 << 26)
-#define ADVERTISED_56000baseKR4_Full	(1 << 27)
-#define ADVERTISED_56000baseCR4_Full	(1 << 28)
-#define ADVERTISED_56000baseSR4_Full	(1 << 29)
-#define ADVERTISED_56000baseLR4_Full	(1 << 30)
+/* Link mode bit indices */
+enum ethtool_link_mode_bit_indices {
+	ETHTOOL_LINK_MODE_10baseT_Half_BIT	= 0,
+	ETHTOOL_LINK_MODE_10baseT_Full_BIT	= 1,
+	ETHTOOL_LINK_MODE_100baseT_Half_BIT	= 2,
+	ETHTOOL_LINK_MODE_100baseT_Full_BIT	= 3,
+	ETHTOOL_LINK_MODE_1000baseT_Half_BIT	= 4,
+	ETHTOOL_LINK_MODE_1000baseT_Full_BIT	= 5,
+	ETHTOOL_LINK_MODE_Autoneg_BIT		= 6,
+	ETHTOOL_LINK_MODE_TP_BIT		= 7,
+	ETHTOOL_LINK_MODE_AUI_BIT		= 8,
+	ETHTOOL_LINK_MODE_MII_BIT		= 9,
+	ETHTOOL_LINK_MODE_FIBRE_BIT		= 10,
+	ETHTOOL_LINK_MODE_BNC_BIT		= 11,
+	ETHTOOL_LINK_MODE_10000baseT_Full_BIT	= 12,
+	ETHTOOL_LINK_MODE_Pause_BIT		= 13,
+	ETHTOOL_LINK_MODE_Asym_Pause_BIT	= 14,
+	ETHTOOL_LINK_MODE_2500baseX_Full_BIT	= 15,
+	ETHTOOL_LINK_MODE_Backplane_BIT		= 16,
+	ETHTOOL_LINK_MODE_1000baseKX_Full_BIT	= 17,
+	ETHTOOL_LINK_MODE_10000baseKX4_Full_BIT	= 18,
+	ETHTOOL_LINK_MODE_10000baseKR_Full_BIT	= 19,
+	ETHTOOL_LINK_MODE_10000baseR_FEC_BIT	= 20,
+	ETHTOOL_LINK_MODE_20000baseMLD2_Full_BIT = 21,
+	ETHTOOL_LINK_MODE_20000baseKR2_Full_BIT	= 22,
+	ETHTOOL_LINK_MODE_40000baseKR4_Full_BIT	= 23,
+	ETHTOOL_LINK_MODE_40000baseCR4_Full_BIT	= 24,
+	ETHTOOL_LINK_MODE_40000baseSR4_Full_BIT	= 25,
+	ETHTOOL_LINK_MODE_40000baseLR4_Full_BIT	= 26,
+	ETHTOOL_LINK_MODE_56000baseKR4_Full_BIT	= 27,
+	ETHTOOL_LINK_MODE_56000baseCR4_Full_BIT	= 28,
+	ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT	= 29,
+	ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT	= 30,
+
+	/* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit
+	 * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_*
+	 * macro for bits > 31. The only way to use indices > 31 is to
+	 * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API.
+	 */
+
+	__ETHTOOL_LINK_MODE_LAST
+	  = ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT,
+};
+
+#define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name)	\
+	(1UL << (ETHTOOL_LINK_MODE_ ## base_name ## _BIT))
+
+/* DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new SUPPORTED_* macro for bits > 31.
+ */
+#define SUPPORTED_10baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half)
+#define SUPPORTED_10baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full)
+#define SUPPORTED_100baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half)
+#define SUPPORTED_100baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full)
+#define SUPPORTED_1000baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half)
+#define SUPPORTED_1000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full)
+#define SUPPORTED_Autoneg		__ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg)
+#define SUPPORTED_TP			__ETHTOOL_LINK_MODE_LEGACY_MASK(TP)
+#define SUPPORTED_AUI			__ETHTOOL_LINK_MODE_LEGACY_MASK(AUI)
+#define SUPPORTED_MII			__ETHTOOL_LINK_MODE_LEGACY_MASK(MII)
+#define SUPPORTED_FIBRE			__ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE)
+#define SUPPORTED_BNC			__ETHTOOL_LINK_MODE_LEGACY_MASK(BNC)
+#define SUPPORTED_10000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full)
+#define SUPPORTED_Pause			__ETHTOOL_LINK_MODE_LEGACY_MASK(Pause)
+#define SUPPORTED_Asym_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause)
+#define SUPPORTED_2500baseX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full)
+#define SUPPORTED_Backplane		__ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define SUPPORTED_1000baseKX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full)
+#define SUPPORTED_10000baseKX4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full)
+#define SUPPORTED_10000baseKR_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full)
+#define SUPPORTED_10000baseR_FEC	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC)
+#define SUPPORTED_20000baseMLD2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full)
+#define SUPPORTED_20000baseKR2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full)
+#define SUPPORTED_40000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full)
+#define SUPPORTED_40000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full)
+#define SUPPORTED_40000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full)
+#define SUPPORTED_40000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full)
+#define SUPPORTED_56000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full)
+#define SUPPORTED_56000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full)
+#define SUPPORTED_56000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full)
+#define SUPPORTED_56000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full)
+/* Please do not define any new SUPPORTED_* macro for bits > 31, see
+ * notice above.
+ */
+
+/*
+ * DEPRECATED macros. Please migrate to
+ * ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. Please do NOT
+ * define any new ADERTISE_* macro for bits > 31.
+ */
+#define ADVERTISED_10baseT_Half		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Half)
+#define ADVERTISED_10baseT_Full		__ETHTOOL_LINK_MODE_LEGACY_MASK(10baseT_Full)
+#define ADVERTISED_100baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Half)
+#define ADVERTISED_100baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(100baseT_Full)
+#define ADVERTISED_1000baseT_Half	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Half)
+#define ADVERTISED_1000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseT_Full)
+#define ADVERTISED_Autoneg		__ETHTOOL_LINK_MODE_LEGACY_MASK(Autoneg)
+#define ADVERTISED_TP			__ETHTOOL_LINK_MODE_LEGACY_MASK(TP)
+#define ADVERTISED_AUI			__ETHTOOL_LINK_MODE_LEGACY_MASK(AUI)
+#define ADVERTISED_MII			__ETHTOOL_LINK_MODE_LEGACY_MASK(MII)
+#define ADVERTISED_FIBRE		__ETHTOOL_LINK_MODE_LEGACY_MASK(FIBRE)
+#define ADVERTISED_BNC			__ETHTOOL_LINK_MODE_LEGACY_MASK(BNC)
+#define ADVERTISED_10000baseT_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseT_Full)
+#define ADVERTISED_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Pause)
+#define ADVERTISED_Asym_Pause		__ETHTOOL_LINK_MODE_LEGACY_MASK(Asym_Pause)
+#define ADVERTISED_2500baseX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(2500baseX_Full)
+#define ADVERTISED_Backplane		__ETHTOOL_LINK_MODE_LEGACY_MASK(Backplane)
+#define ADVERTISED_1000baseKX_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(1000baseKX_Full)
+#define ADVERTISED_10000baseKX4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKX4_Full)
+#define ADVERTISED_10000baseKR_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseKR_Full)
+#define ADVERTISED_10000baseR_FEC	__ETHTOOL_LINK_MODE_LEGACY_MASK(10000baseR_FEC)
+#define ADVERTISED_20000baseMLD2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseMLD2_Full)
+#define ADVERTISED_20000baseKR2_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(20000baseKR2_Full)
+#define ADVERTISED_40000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseKR4_Full)
+#define ADVERTISED_40000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseCR4_Full)
+#define ADVERTISED_40000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseSR4_Full)
+#define ADVERTISED_40000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(40000baseLR4_Full)
+#define ADVERTISED_56000baseKR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseKR4_Full)
+#define ADVERTISED_56000baseCR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseCR4_Full)
+#define ADVERTISED_56000baseSR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseSR4_Full)
+#define ADVERTISED_56000baseLR4_Full	__ETHTOOL_LINK_MODE_LEGACY_MASK(56000baseLR4_Full)
+/* Please do not define any new ADVERTISED_* macro for bits > 31, see
+ * notice above.
+ */
 
 /* The following are all involved in forcing a particular link
  * mode for the device for setting things.  When getting the
@@ -1533,4 +1604,123 @@ enum ethtool_reset_flags {
 };
 #define ETH_RESET_SHARED_SHIFT	16
 
+
+/**
+ * struct ethtool_link_settings - link control and status
+ *
+ * IMPORTANT, Backward compatibility notice: When implementing new
+ *	user-space tools, please first try %ETHTOOL_GLINKSETTINGS, and
+ *	if it succeeds use %ETHTOOL_SLINKSETTINGS to change link
+ *	settings; do not use %ETHTOOL_SSET if %ETHTOOL_GLINKSETTINGS
+ *	succeeded: stick to %ETHTOOL_GLINKSETTINGS/%SLINKSETTINGS in
+ *	that case.  Conversely, if %ETHTOOL_GLINKSETTINGS fails, use
+ *	%ETHTOOL_GSET to query and %ETHTOOL_SSET to change link
+ *	settings; do not use %ETHTOOL_SLINKSETTINGS if
+ *	%ETHTOOL_GLINKSETTINGS failed: stick to
+ *	%ETHTOOL_GSET/%ETHTOOL_SSET in that case.
+ *
+ * @cmd: Command number = %ETHTOOL_GLINKSETTINGS or %ETHTOOL_SLINKSETTINGS
+ * @speed: Link speed (Mbps)
+ * @duplex: Duplex mode; one of %DUPLEX_*
+ * @port: Physical connector type; one of %PORT_*
+ * @phy_address: MDIO address of PHY (transceiver); 0 or 255 if not
+ *	applicable.  For clause 45 PHYs this is the PRTAD.
+ * @autoneg: Enable/disable autonegotiation and auto-detection;
+ *	either %AUTONEG_DISABLE or %AUTONEG_ENABLE
+ * @mdio_support: Bitmask of %ETH_MDIO_SUPPORTS_* flags for the MDIO
+ *	protocols supported by the interface; 0 if unknown.
+ *	Read-only.
+ * @eth_tp_mdix: Ethernet twisted-pair MDI(-X) status; one of
+ *	%ETH_TP_MDI_*.  If the status is unknown or not applicable, the
+ *	value will be %ETH_TP_MDI_INVALID.  Read-only.
+ * @eth_tp_mdix_ctrl: Ethernet twisted pair MDI(-X) control; one of
+ *	%ETH_TP_MDI_*.  If MDI(-X) control is not implemented, reads
+ *	yield %ETH_TP_MDI_INVALID and writes may be ignored or rejected.
+ *	When written successfully, the link should be renegotiated if
+ *	necessary.
+ * @link_mode_masks_nwords: Number of 32-bit words for each of the
+ *	supported, advertising, lp_advertising link mode bitmaps. For
+ *	%ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user
+ *	(>= 0); on return, if handshake in progress, negative if
+ *	request size unsupported by kernel: absolute value indicates
+ *	kernel recommended size and cmd field is 0, as well as all the
+ *	other fields; otherwise (handshake completed), strictly
+ *	positive to indicate size used by kernel and cmd field is
+ *	%ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For
+ *	%ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive
+ *	value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise
+ *	refused. For drivers: ignore this field (use kernel's
+ *	__ETHTOOL_LINK_MODE_MASK_NBITS instead), any change to it will
+ *	be overwritten by kernel.
+ * @supported: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, physical
+ *	connectors and other link features for which the interface
+ *	supports autonegotiation or auto-detection.  Read-only.
+ * @advertising: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, physical
+ *	connectors and other link features that are advertised through
+ *	autonegotiation or enabled for auto-detection.
+ * @lp_advertising: Bitmap with each bit meaning given by
+ *	%ethtool_link_mode_bit_indices for the link modes, and other
+ *	link features that the link partner advertised through
+ *	autonegotiation; 0 if unknown or not applicable.  Read-only.
+ *
+ * If autonegotiation is disabled, the speed and @duplex represent the
+ * fixed link mode and are writable if the driver supports multiple
+ * link modes.  If it is enabled then they are read-only; if the link
+ * is up they represent the negotiated link mode; if the link is down,
+ * the speed is 0, %SPEED_UNKNOWN or the highest enabled speed and
+ * @duplex is %DUPLEX_UNKNOWN or the best enabled duplex mode.
+ *
+ * Some hardware interfaces may have multiple PHYs and/or physical
+ * connectors fitted or do not allow the driver to detect which are
+ * fitted.  For these interfaces @port and/or @phy_address may be
+ * writable, possibly dependent on @autoneg being %AUTONEG_DISABLE.
+ * Otherwise, attempts to write different values may be ignored or
+ * rejected.
+ *
+ * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt
+ * are not available in %ethtool_link_settings. Until all drivers are
+ * converted to ignore them or to the new %ethtool_link_settings API,
+ * for both queries and changes, users should always try
+ * %ETHTOOL_GLINKSETTINGS first, and if it fails with -ENOTSUPP stick
+ * only to %ETHTOOL_GSET and %ETHTOOL_SSET consistently. If it
+ * succeeds, then users should stick to %ETHTOOL_GLINKSETTINGS and
+ * %ETHTOOL_SLINKSETTINGS (which would support drivers implementing
+ * either %ethtool_cmd or %ethtool_link_settings).
+ *
+ * Users should assume that all fields not marked read-only are
+ * writable and subject to validation by the driver.  They should use
+ * %ETHTOOL_GLINKSETTINGS to get the current values before making specific
+ * changes and then applying them with %ETHTOOL_SLINKSETTINGS.
+ *
+ * Drivers that implement %get_link_ksettings and/or
+ * %set_link_ksettings should ignore the @cmd
+ * and @link_mode_masks_nwords fields (any change to them overwritten
+ * by kernel), and rely only on kernel's internal
+ * %__ETHTOOL_LINK_MODE_MASK_NBITS and
+ * %ethtool_link_mode_mask_t. Drivers that implement
+ * %set_link_ksettings() should validate all fields other than @cmd
+ * and @link_mode_masks_nwords that are not described as read-only or
+ * deprecated, and must ignore all fields described as read-only.
+ */
+struct ethtool_link_settings {
+	__u32	cmd;
+	__u32	speed;
+	__u8	duplex;
+	__u8	port;
+	__u8	phy_address;
+	__u8	autoneg;
+	__u8	mdio_support;
+	__u8	eth_tp_mdix;
+	__u8	eth_tp_mdix_ctrl;
+	__s8	link_mode_masks_nwords;
+	__u32	reserved[8];
+	__u32	link_mode_masks[0];
+	/* layout of link_mode_masks fields:
+	 * __u32 map_supported[link_mode_masks_nwords];
+	 * __u32 map_advertising[link_mode_masks_nwords];
+	 * __u32 map_lp_advertising[link_mode_masks_nwords];
+	 */
+};
 #endif /* _UAPI_LINUX_ETHTOOL_H */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 2406101002b1..edcec56ed228 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -387,6 +387,359 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data)
 	return 0;
 }
 
+static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32)
+{
+	bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+	dst[0] = legacy_u32;
+}
+
+/* return false if src had higher bits set. lower bits always updated. */
+static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+					    const unsigned long *src)
+{
+	bool retval = true;
+
+	/* TODO: following test will soon always be true */
+	if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) {
+		__ETHTOOL_DECLARE_LINK_MODE_MASK(ext);
+
+		bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+		bitmap_fill(ext, 32);
+		bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+		if (bitmap_intersects(ext, src,
+				      __ETHTOOL_LINK_MODE_MASK_NBITS)) {
+			/* src mask goes beyond bit 31 */
+			retval = false;
+		}
+	}
+	*legacy_u32 = src[0];
+	return retval;
+}
+
+/* return false if legacy contained non-0 deprecated fields
+ * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated
+ */
+static bool
+convert_legacy_settings_to_link_ksettings(
+	struct ethtool_link_ksettings *link_ksettings,
+	const struct ethtool_cmd *legacy_settings)
+{
+	bool retval = true;
+
+	memset(link_ksettings, 0, sizeof(*link_ksettings));
+
+	/* This is used to tell users that driver is still using these
+	 * deprecated legacy fields, and they should not use
+	 * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
+	 */
+	if (legacy_settings->transceiver ||
+	    legacy_settings->maxtxpkt ||
+	    legacy_settings->maxrxpkt)
+		retval = false;
+
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.supported,
+		legacy_settings->supported);
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.advertising,
+		legacy_settings->advertising);
+	convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.lp_advertising,
+		legacy_settings->lp_advertising);
+	link_ksettings->base.speed
+		= ethtool_cmd_speed(legacy_settings);
+	link_ksettings->base.duplex
+		= legacy_settings->duplex;
+	link_ksettings->base.port
+		= legacy_settings->port;
+	link_ksettings->base.phy_address
+		= legacy_settings->phy_address;
+	link_ksettings->base.autoneg
+		= legacy_settings->autoneg;
+	link_ksettings->base.mdio_support
+		= legacy_settings->mdio_support;
+	link_ksettings->base.eth_tp_mdix
+		= legacy_settings->eth_tp_mdix;
+	link_ksettings->base.eth_tp_mdix_ctrl
+		= legacy_settings->eth_tp_mdix_ctrl;
+	return retval;
+}
+
+/* return false if ksettings link modes had higher bits
+ * set. legacy_settings always updated (best effort)
+ */
+static bool
+convert_link_ksettings_to_legacy_settings(
+	struct ethtool_cmd *legacy_settings,
+	const struct ethtool_link_ksettings *link_ksettings)
+{
+	bool retval = true;
+
+	memset(legacy_settings, 0, sizeof(*legacy_settings));
+	/* this also clears the deprecated fields in legacy structure:
+	 * __u8		transceiver;
+	 * __u32	maxtxpkt;
+	 * __u32	maxrxpkt;
+	 */
+
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->supported,
+		link_ksettings->link_modes.supported);
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->advertising,
+		link_ksettings->link_modes.advertising);
+	retval &= convert_link_mode_to_legacy_u32(
+		&legacy_settings->lp_advertising,
+		link_ksettings->link_modes.lp_advertising);
+	ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
+	legacy_settings->duplex
+		= link_ksettings->base.duplex;
+	legacy_settings->port
+		= link_ksettings->base.port;
+	legacy_settings->phy_address
+		= link_ksettings->base.phy_address;
+	legacy_settings->autoneg
+		= link_ksettings->base.autoneg;
+	legacy_settings->mdio_support
+		= link_ksettings->base.mdio_support;
+	legacy_settings->eth_tp_mdix
+		= link_ksettings->base.eth_tp_mdix;
+	legacy_settings->eth_tp_mdix_ctrl
+		= link_ksettings->base.eth_tp_mdix_ctrl;
+	return retval;
+}
+
+/* number of 32-bit words to store the user's link mode bitmaps */
+#define __ETHTOOL_LINK_MODE_MASK_NU32			\
+	DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32)
+
+/* layout of the struct passed from/to userland */
+struct ethtool_link_usettings {
+	struct ethtool_link_settings base;
+	struct {
+		__u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32];
+		__u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+		__u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+	} link_modes;
+};
+
+/* Internal kernel helper to query a device ethtool_link_settings.
+ *
+ * Backward compatibility note: for compatibility with legacy drivers
+ * that implement only the ethtool_cmd API, this has to work with both
+ * drivers implementing get_link_ksettings API and drivers
+ * implementing get_settings API. When drivers implement get_settings
+ * and report ethtool_cmd deprecated fields
+ * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
+ * because the resulting struct ethtool_link_settings does not report them.
+ */
+int __ethtool_get_link_ksettings(struct net_device *dev,
+				 struct ethtool_link_ksettings *link_ksettings)
+{
+	int err;
+	struct ethtool_cmd cmd;
+
+	ASSERT_RTNL();
+
+	if (dev->ethtool_ops->get_link_ksettings) {
+		memset(link_ksettings, 0, sizeof(*link_ksettings));
+		return dev->ethtool_ops->get_link_ksettings(dev,
+							    link_ksettings);
+	}
+
+	/* driver doesn't support %ethtool_link_ksettings API. revert to
+	 * legacy %ethtool_cmd API, unless it's not supported either.
+	 * TODO: remove when ethtool_ops::get_settings disappears internally
+	 */
+	err = __ethtool_get_settings(dev, &cmd);
+	if (err < 0)
+		return err;
+
+	/* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
+	 */
+	convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
+	return err;
+}
+EXPORT_SYMBOL(__ethtool_get_link_ksettings);
+
+/* convert ethtool_link_usettings in user space to a kernel internal
+ * ethtool_link_ksettings. return 0 on success, errno on error.
+ */
+static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to,
+					 const void __user *from)
+{
+	struct ethtool_link_usettings link_usettings;
+
+	if (copy_from_user(&link_usettings, from, sizeof(link_usettings)))
+		return -EFAULT;
+
+	memcpy(&to->base, &link_usettings.base, sizeof(to->base));
+	bitmap_from_u32array(to->link_modes.supported,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.supported,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+	bitmap_from_u32array(to->link_modes.advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+	bitmap_from_u32array(to->link_modes.lp_advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_usettings.link_modes.lp_advertising,
+			     __ETHTOOL_LINK_MODE_MASK_NU32);
+
+	return 0;
+}
+
+/* convert a kernel internal ethtool_link_ksettings to
+ * ethtool_link_usettings in user space. return 0 on success, errno on
+ * error.
+ */
+static int
+store_link_ksettings_for_user(void __user *to,
+			      const struct ethtool_link_ksettings *from)
+{
+	struct ethtool_link_usettings link_usettings;
+
+	memcpy(&link_usettings.base, &from->base, sizeof(link_usettings));
+	bitmap_to_u32array(link_usettings.link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.supported,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_to_u32array(link_usettings.link_modes.advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+	bitmap_to_u32array(link_usettings.link_modes.lp_advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NU32,
+			   from->link_modes.lp_advertising,
+			   __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+	if (copy_to_user(to, &link_usettings, sizeof(link_usettings)))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* Query device for its ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::get_link_ksettings, even if legacy
+ * ethtool_ops::get_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_GSET for
+ * this driver, so that they can correctly access the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_get_link_ksettings(struct net_device *dev,
+				      void __user *useraddr)
+{
+	int err = 0;
+	struct ethtool_link_ksettings link_ksettings;
+
+	ASSERT_RTNL();
+
+	if (!dev->ethtool_ops->get_link_ksettings)
+		return -EOPNOTSUPP;
+
+	/* handle bitmap nbits handshake */
+	if (copy_from_user(&link_ksettings.base, useraddr,
+			   sizeof(link_ksettings.base)))
+		return -EFAULT;
+
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords) {
+		/* wrong link mode nbits requested */
+		memset(&link_ksettings, 0, sizeof(link_ksettings));
+		/* keep cmd field reset to 0 */
+		/* send back number of words required as negative val */
+		compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX,
+				   "need too many bits for link modes!");
+		link_ksettings.base.link_mode_masks_nwords
+			= -((s8)__ETHTOOL_LINK_MODE_MASK_NU32);
+
+		/* copy the base fields back to user, not the link
+		 * mode bitmaps
+		 */
+		if (copy_to_user(useraddr, &link_ksettings.base,
+				 sizeof(link_ksettings.base)))
+			return -EFAULT;
+
+		return 0;
+	}
+
+	/* handshake successful: user/kernel agree on
+	 * link_mode_masks_nwords
+	 */
+
+	memset(&link_ksettings, 0, sizeof(link_ksettings));
+	err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
+	if (err < 0)
+		return err;
+
+	/* make sure we tell the right values to user */
+	link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+	link_ksettings.base.link_mode_masks_nwords
+		= __ETHTOOL_LINK_MODE_MASK_NU32;
+
+	return store_link_ksettings_for_user(useraddr, &link_ksettings);
+}
+
+/* Update device ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::set_link_ksettings, even if legacy
+ * ethtool_ops::set_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_SSET for
+ * this driver, so that they can correctly update the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_set_link_ksettings(struct net_device *dev,
+				      void __user *useraddr)
+{
+	int err;
+	struct ethtool_link_ksettings link_ksettings;
+
+	ASSERT_RTNL();
+
+	if (!dev->ethtool_ops->set_link_ksettings)
+		return -EOPNOTSUPP;
+
+	/* make sure nbits field has expected value */
+	if (copy_from_user(&link_ksettings.base, useraddr,
+			   sizeof(link_ksettings.base)))
+		return -EFAULT;
+
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords)
+		return -EINVAL;
+
+	/* copy the whole structure, now that we know it has expected
+	 * format
+	 */
+	err = load_link_ksettings_from_user(&link_ksettings, useraddr);
+	if (err)
+		return err;
+
+	/* re-check nwords field, just in case */
+	if (__ETHTOOL_LINK_MODE_MASK_NU32
+	    != link_ksettings.base.link_mode_masks_nwords)
+		return -EINVAL;
+
+	return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+}
+
+/* Internal kernel helper to query a device ethtool_cmd settings.
+ *
+ * Note about transition to ethtool_link_settings API: We do not need
+ * (or want) this function to support "dev" instances that implement
+ * the ethtool_link_settings API as we will update the drivers calling
+ * this function to call __ethtool_get_link_ksettings instead, before
+ * the first drivers implement ethtool_ops::get_link_ksettings.
+ *
+ * TODO 1: at least make this function static when no driver is using it
+ * TODO 2: remove when ethtool_ops::get_settings disappears internally
+ */
 int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
 	ASSERT_RTNL();
@@ -400,30 +753,112 @@ int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 }
 EXPORT_SYMBOL(__ethtool_get_settings);
 
+static void
+warn_incomplete_ethtool_legacy_settings_conversion(const char *details)
+{
+	char name[sizeof(current->comm)];
+
+	pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n",
+		     get_task_comm(name, current), details);
+}
+
+/* Query device for its ethtool_cmd settings.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing get_link_ksettings
+ * API and drivers implementing get_settings API. When drivers
+ * implement get_link_ksettings and report higher link mode bits, a
+ * kernel warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, but the command is successful
+ * (only the lower link mode bits reported back to user).
+ */
 static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 {
-	int err;
 	struct ethtool_cmd cmd;
 
-	err = __ethtool_get_settings(dev, &cmd);
-	if (err < 0)
-		return err;
+	ASSERT_RTNL();
+
+	if (dev->ethtool_ops->get_link_ksettings) {
+		/* First, use link_ksettings API if it is supported */
+		int err;
+		struct ethtool_link_ksettings link_ksettings;
+
+		memset(&link_ksettings, 0, sizeof(link_ksettings));
+		err = dev->ethtool_ops->get_link_ksettings(dev,
+							   &link_ksettings);
+		if (err < 0)
+			return err;
+		if (!convert_link_ksettings_to_legacy_settings(&cmd,
+							       &link_ksettings))
+			warn_incomplete_ethtool_legacy_settings_conversion(
+				"link modes are only partially reported");
+
+		/* send a sensible cmd tag back to user */
+		cmd.cmd = ETHTOOL_GSET;
+	} else {
+		int err;
+		/* TODO: return -EOPNOTSUPP when
+		 * ethtool_ops::get_settings disappears internally
+		 */
+
+		/* driver doesn't support %ethtool_link_ksettings
+		 * API. revert to legacy %ethtool_cmd API, unless it's
+		 * not supported either.
+		 */
+		err = __ethtool_get_settings(dev, &cmd);
+		if (err < 0)
+			return err;
+	}
 
 	if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
 		return -EFAULT;
+
 	return 0;
 }
 
+/* Update device link settings with given ethtool_cmd.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing set_link_ksettings
+ * API and drivers implementing set_settings API. When drivers
+ * implement set_link_ksettings and user's request updates deprecated
+ * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
+ * warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, and the request is rejected.
+ */
 static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_cmd cmd;
 
-	if (!dev->ethtool_ops->set_settings)
-		return -EOPNOTSUPP;
+	ASSERT_RTNL();
 
 	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
 		return -EFAULT;
 
+	/* first, try new %ethtool_link_ksettings API. */
+	if (dev->ethtool_ops->set_link_ksettings) {
+		struct ethtool_link_ksettings link_ksettings;
+
+		if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
+							       &cmd))
+			return -EINVAL;
+
+		link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
+		link_ksettings.base.link_mode_masks_nwords
+			= __ETHTOOL_LINK_MODE_MASK_NU32;
+		return dev->ethtool_ops->set_link_ksettings(dev,
+							    &link_ksettings);
+	}
+
+	/* legacy %ethtool_cmd API */
+
+	/* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
+	 * disappears internally
+	 */
+
+	if (!dev->ethtool_ops->set_settings)
+		return -EOPNOTSUPP;
+
 	return dev->ethtool_ops->set_settings(dev, &cmd);
 }
 
@@ -2252,6 +2687,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_PERQUEUE:
 		rc = ethtool_set_per_queue(dev, useraddr);
 		break;
+	case ETHTOOL_GLINKSETTINGS:
+		rc = ethtool_get_link_ksettings(dev, useraddr);
+		break;
+	case ETHTOOL_SLINKSETTINGS:
+		rc = ethtool_set_link_ksettings(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From 5af557a6d2a077a8678879cf9498fbf8d5d81697 Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Thu, 11 Feb 2016 21:41:18 -0200
Subject: [media] uapi/media.h: Declare interface types for ALSA

Declare the interface types to be used on alsa for
the new G_TOPOLOGY ioctl.

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Acked-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 drivers/media/media-entity.c | 16 ++++++++++++++++
 include/uapi/linux/media.h   | 10 ++++++++++
 2 files changed, 26 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/media-entity.c b/drivers/media/media-entity.c
index 48d87c6f5e90..33628b8cba7e 100644
--- a/drivers/media/media-entity.c
+++ b/drivers/media/media-entity.c
@@ -65,6 +65,22 @@ static inline const char *intf_type(struct media_interface *intf)
 		return "v4l2-subdev";
 	case MEDIA_INTF_T_V4L_SWRADIO:
 		return "swradio";
+	case MEDIA_INTF_T_ALSA_PCM_CAPTURE:
+		return "pcm-capture";
+	case MEDIA_INTF_T_ALSA_PCM_PLAYBACK:
+		return "pcm-playback";
+	case MEDIA_INTF_T_ALSA_CONTROL:
+		return "alsa-control";
+	case MEDIA_INTF_T_ALSA_COMPRESS:
+		return "compress";
+	case MEDIA_INTF_T_ALSA_RAWMIDI:
+		return "rawmidi";
+	case MEDIA_INTF_T_ALSA_HWDEP:
+		return "hwdep";
+	case MEDIA_INTF_T_ALSA_SEQUENCER:
+		return "sequencer";
+	case MEDIA_INTF_T_ALSA_TIMER:
+		return "timer";
 	default:
 		return "unknown-intf";
 	}
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index 6aac2f035a5d..f53ede3efa2f 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -263,6 +263,7 @@ struct media_links_enum {
 
 #define MEDIA_INTF_T_DVB_BASE	0x00000100
 #define MEDIA_INTF_T_V4L_BASE	0x00000200
+#define MEDIA_INTF_T_ALSA_BASE	0x00000300
 
 /* Interface types */
 
@@ -278,6 +279,15 @@ struct media_links_enum {
 #define MEDIA_INTF_T_V4L_SUBDEV (MEDIA_INTF_T_V4L_BASE + 3)
 #define MEDIA_INTF_T_V4L_SWRADIO (MEDIA_INTF_T_V4L_BASE + 4)
 
+#define MEDIA_INTF_T_ALSA_PCM_CAPTURE   (MEDIA_INTF_T_ALSA_BASE)
+#define MEDIA_INTF_T_ALSA_PCM_PLAYBACK  (MEDIA_INTF_T_ALSA_BASE + 1)
+#define MEDIA_INTF_T_ALSA_CONTROL       (MEDIA_INTF_T_ALSA_BASE + 2)
+#define MEDIA_INTF_T_ALSA_COMPRESS      (MEDIA_INTF_T_ALSA_BASE + 3)
+#define MEDIA_INTF_T_ALSA_RAWMIDI       (MEDIA_INTF_T_ALSA_BASE + 4)
+#define MEDIA_INTF_T_ALSA_HWDEP         (MEDIA_INTF_T_ALSA_BASE + 5)
+#define MEDIA_INTF_T_ALSA_SEQUENCER     (MEDIA_INTF_T_ALSA_BASE + 6)
+#define MEDIA_INTF_T_ALSA_TIMER         (MEDIA_INTF_T_ALSA_BASE + 7)
+
 /*
  * MC next gen API definitions
  *
-- 
cgit v1.2.3


From 7a2eba12ff071dbc8e84ff4c2208d9f8bbb0046a Mon Sep 17 00:00:00 2001
From: Shuah Khan <shuahkh@osg.samsung.com>
Date: Thu, 11 Feb 2016 21:41:20 -0200
Subject: [media] media: Add ALSA Media Controller function entities

Add ALSA Media Controller capture, playback, and mixer
function entity defines.

[mchehab@osg.samsung.com: fix a trivial merge conflict and start
 MEDIA_ENT_AUDIO_F from 3000]

Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
Acked-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 include/uapi/linux/media.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index f53ede3efa2f..95e126edb1c3 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -87,6 +87,13 @@ struct media_device_info {
 #define MEDIA_ENT_F_IF_VID_DECODER	(MEDIA_ENT_F_BASE + 2001)
 #define MEDIA_ENT_F_IF_AUD_DECODER	(MEDIA_ENT_F_BASE + 2002)
 
+/*
+ * Audio Entity Functions
+ */
+#define MEDIA_ENT_F_AUDIO_CAPTURE	(MEDIA_ENT_F_BASE + 3000)
+#define MEDIA_ENT_F_AUDIO_PLAYBACK	(MEDIA_ENT_F_BASE + 3001)
+#define MEDIA_ENT_F_AUDIO_MIXER		(MEDIA_ENT_F_BASE + 3002)
+
 /*
  * Connectors
  */
-- 
cgit v1.2.3


From b07edbe1cf3dae9ba81f24888e2f2a9dbe778918 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Tue, 16 Feb 2016 17:24:08 +0100
Subject: netfilter: meta: add PRANDOM support

Can be used to randomly match packets e.g. for statistic traffic sampling.

See commit 3ad0040573b0c00f8848
("bpf: split state from prandom_u32() and consolidate {c, e}BPF prngs")
for more info why this doesn't use prandom_u32 directly.

Unlike bpf nft_meta can be built as a module, so add an EXPORT_SYMBOL
for prandom_seed_full_state too.

Cc: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h |  2 ++
 lib/random32.c                           |  1 +
 net/netfilter/nft_meta.c                 | 11 +++++++++++
 3 files changed, 14 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index be41ffc128b8..b19be0a098c0 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -681,6 +681,7 @@ enum nft_exthdr_attributes {
  * @NFT_META_IIFGROUP: packet input interface group
  * @NFT_META_OIFGROUP: packet output interface group
  * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid)
+ * @NFT_META_PRANDOM: a 32bit pseudo-random number
  */
 enum nft_meta_keys {
 	NFT_META_LEN,
@@ -707,6 +708,7 @@ enum nft_meta_keys {
 	NFT_META_IIFGROUP,
 	NFT_META_OIFGROUP,
 	NFT_META_CGROUP,
+	NFT_META_PRANDOM,
 };
 
 /**
diff --git a/lib/random32.c b/lib/random32.c
index 12111910ccd0..510d1ce7d4d2 100644
--- a/lib/random32.c
+++ b/lib/random32.c
@@ -255,6 +255,7 @@ void prandom_seed_full_state(struct rnd_state __percpu *pcpu_state)
 		prandom_warmup(state);
 	}
 }
+EXPORT_SYMBOL(prandom_seed_full_state);
 
 /*
  *	Generate better values after random number generator
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index fe885bf271c5..16c50b0dd426 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -28,6 +28,8 @@
 
 #include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
 
+static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
+
 void nft_meta_get_eval(const struct nft_expr *expr,
 		       struct nft_regs *regs,
 		       const struct nft_pktinfo *pkt)
@@ -181,6 +183,11 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		*dest = sock_cgroup_classid(&sk->sk_cgrp_data);
 		break;
 #endif
+	case NFT_META_PRANDOM: {
+		struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
+		*dest = prandom_u32_state(state);
+		break;
+	}
 	default:
 		WARN_ON(1);
 		goto err;
@@ -277,6 +284,10 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 	case NFT_META_OIFNAME:
 		len = IFNAMSIZ;
 		break;
+	case NFT_META_PRANDOM:
+		prandom_init_once(&nft_prandom_state);
+		len = sizeof(u32);
+		break;
 	default:
 		return -EOPNOTSUPP;
 	}
-- 
cgit v1.2.3


From 808e738142e7086ef793ebf9797099c392894e65 Mon Sep 17 00:00:00 2001
From: Shannon Zhao <shannon.zhao@linaro.org>
Date: Mon, 11 Jan 2016 22:46:15 +0800
Subject: arm64: KVM: Add a new feature bit for PMUv3

To support guest PMUv3, use one bit of the VCPU INIT feature array.
Initialize the PMU when initialzing the vcpu with that bit and PMU
overflow interrupt set.

Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
Acked-by: Peter Maydell <peter.maydell@linaro.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 Documentation/virtual/kvm/api.txt |  2 ++
 arch/arm64/include/asm/kvm_host.h |  2 +-
 arch/arm64/include/uapi/asm/kvm.h |  1 +
 arch/arm64/kvm/reset.c            |  3 +++
 include/kvm/arm_pmu.h             |  2 ++
 include/uapi/linux/kvm.h          |  1 +
 virt/kvm/arm/pmu.c                | 10 ++++++++++
 7 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 07e4cdf02407..9684f8dc6bb2 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2577,6 +2577,8 @@ Possible features:
 	  Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only).
 	- KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 for the CPU.
 	  Depends on KVM_CAP_ARM_PSCI_0_2.
+	- KVM_ARM_VCPU_PMU_V3: Emulate PMUv3 for the CPU.
+	  Depends on KVM_CAP_ARM_PMU_V3.
 
 
 4.83 KVM_ARM_PREFERRED_TARGET
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index a819c6debce4..b02ef0828f22 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -42,7 +42,7 @@
 
 #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
 
-#define KVM_VCPU_MAX_FEATURES 3
+#define KVM_VCPU_MAX_FEATURES 4
 
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 2d4ca4bb0dd3..6aedbe314432 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -94,6 +94,7 @@ struct kvm_regs {
 #define KVM_ARM_VCPU_POWER_OFF		0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_EL1_32BIT		1 /* CPU running a 32bit VM */
 #define KVM_ARM_VCPU_PSCI_0_2		2 /* CPU uses PSCI v0.2 */
+#define KVM_ARM_VCPU_PMU_V3		3 /* Support guest PMUv3 */
 
 struct kvm_vcpu_init {
 	__u32 target;
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index dfbce781d284..cf4f28a7a514 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -77,6 +77,9 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_GUEST_DEBUG_HW_WPS:
 		r = get_num_wrps();
 		break;
+	case KVM_CAP_ARM_PMU_V3:
+		r = kvm_arm_support_pmu_v3();
+		break;
 	case KVM_CAP_SET_GUEST_DEBUG:
 		r = 1;
 		break;
diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
index 9f87d717ef84..ee62497d46f7 100644
--- a/include/kvm/arm_pmu.h
+++ b/include/kvm/arm_pmu.h
@@ -53,6 +53,7 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val);
 void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val);
 void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
 				    u64 select_idx);
+bool kvm_arm_support_pmu_v3(void);
 #else
 struct kvm_pmu {
 };
@@ -80,6 +81,7 @@ static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {}
 static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {}
 static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu,
 						  u64 data, u64 select_idx) {}
+static inline bool kvm_arm_support_pmu_v3(void) { return false; }
 #endif
 
 #endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 9da905157cee..dc16d3084d4a 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -850,6 +850,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
 #define KVM_CAP_HYPERV_SYNIC 123
 #define KVM_CAP_S390_RI 124
+#define KVM_CAP_ARM_PMU_V3 125
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 9b83857da195..6e28f4f86cc6 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -405,3 +405,13 @@ void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
 
 	pmc->perf_event = event;
 }
+
+bool kvm_arm_support_pmu_v3(void)
+{
+	/*
+	 * Check if HW_PERF_EVENTS are supported by checking the number of
+	 * hardware performance counters. This could ensure the presence of
+	 * a physical PMU and CONFIG_PERF_EVENT is selected.
+	 */
+	return (perf_num_counters() > 0);
+}
-- 
cgit v1.2.3


From f577f6c2a6a5ccabe98061f256a1e2ff468d5e93 Mon Sep 17 00:00:00 2001
From: Shannon Zhao <shannon.zhao@linaro.org>
Date: Mon, 11 Jan 2016 20:56:17 +0800
Subject: arm64: KVM: Introduce per-vcpu kvm device controls

In some cases it needs to get/set attributes specific to a vcpu and so
needs something else than ONE_REG.

Let's copy the KVM_DEVICE approach, and define the respective ioctls
for the vcpu file descriptor.

Signed-off-by: Shannon Zhao <shannon.zhao@linaro.org>
Reviewed-by: Andrew Jones <drjones@redhat.com>
Acked-by: Peter Maydell <peter.maydell@linaro.org>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 Documentation/virtual/kvm/api.txt          | 10 +++---
 Documentation/virtual/kvm/devices/vcpu.txt |  8 +++++
 arch/arm/kvm/arm.c                         | 55 ++++++++++++++++++++++++++++++
 arch/arm64/kvm/reset.c                     |  1 +
 include/uapi/linux/kvm.h                   |  1 +
 5 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/virtual/kvm/devices/vcpu.txt

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 9684f8dc6bb2..cb2ef0bcdcb5 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2507,8 +2507,9 @@ struct kvm_create_device {
 
 4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
 
-Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device
-Type: device ioctl, vm ioctl
+Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
+  KVM_CAP_VCPU_ATTRIBUTES for vcpu device
+Type: device ioctl, vm ioctl, vcpu ioctl
 Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
@@ -2533,8 +2534,9 @@ struct kvm_device_attr {
 
 4.81 KVM_HAS_DEVICE_ATTR
 
-Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device
-Type: device ioctl, vm ioctl
+Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
+  KVM_CAP_VCPU_ATTRIBUTES for vcpu device
+Type: device ioctl, vm ioctl, vcpu ioctl
 Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
diff --git a/Documentation/virtual/kvm/devices/vcpu.txt b/Documentation/virtual/kvm/devices/vcpu.txt
new file mode 100644
index 000000000000..3cc59c5e44ce
--- /dev/null
+++ b/Documentation/virtual/kvm/devices/vcpu.txt
@@ -0,0 +1,8 @@
+Generic vcpu interface
+====================================
+
+The virtual cpu "device" also accepts the ioctls KVM_SET_DEVICE_ATTR,
+KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same struct
+kvm_device_attr as other devices, but targets VCPU-wide settings and controls.
+
+The groups and attributes per virtual cpu, if any, are architecture specific.
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 9d133df2da53..166232356291 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -828,11 +828,51 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	int ret = -ENXIO;
+
+	switch (attr->group) {
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	int ret = -ENXIO;
+
+	switch (attr->group) {
+	default:
+		break;
+	}
+
+	return ret;
+}
+
+static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	int ret = -ENXIO;
+
+	switch (attr->group) {
+	default:
+		break;
+	}
+
+	return ret;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
+	struct kvm_device_attr attr;
 
 	switch (ioctl) {
 	case KVM_ARM_VCPU_INIT: {
@@ -875,6 +915,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			return -E2BIG;
 		return kvm_arm_copy_reg_indices(vcpu, user_list->reg);
 	}
+	case KVM_SET_DEVICE_ATTR: {
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			return -EFAULT;
+		return kvm_arm_vcpu_set_attr(vcpu, &attr);
+	}
+	case KVM_GET_DEVICE_ATTR: {
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			return -EFAULT;
+		return kvm_arm_vcpu_get_attr(vcpu, &attr);
+	}
+	case KVM_HAS_DEVICE_ATTR: {
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			return -EFAULT;
+		return kvm_arm_vcpu_has_attr(vcpu, &attr);
+	}
 	default:
 		return -EINVAL;
 	}
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index cf4f28a7a514..9677bf069bcc 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -81,6 +81,7 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
 		r = kvm_arm_support_pmu_v3();
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG:
+	case KVM_CAP_VCPU_ATTRIBUTES:
 		r = 1;
 		break;
 	default:
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index dc16d3084d4a..50f44a229212 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -851,6 +851,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_HYPERV_SYNIC 123
 #define KVM_CAP_S390_RI 124
 #define KVM_CAP_ARM_PMU_V3 125
+#define KVM_CAP_VCPU_ATTRIBUTES 126
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From 9e8ce79cd711d4dfe09d8bba6822cd9bb7db96bd Mon Sep 17 00:00:00 2001
From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 26 Feb 2016 07:54:39 -0800
Subject: net: sched: cls_u32 add bit to specify software only rules

In the initial implementation the only way to stop a rule from being
inserted into the hardware table was via the device feature flag.
However this doesn't work well when working on an end host system
where packets are expect to hit both the hardware and software
datapaths.

For example we can imagine a rule that will match an IP address and
increment a field. If we install this rule in both hardware and
software we may increment the field twice. To date we have only
added support for the drop action so we have been able to ignore
these cases. But as we extend the action support we will hit this
example plus more such cases. Arguably these are not even corner
cases in many working systems these cases will be common.

To avoid forcing the driver to always abort (i.e. the above example)
this patch adds a flag to add a rule in software only. A careful
user can use this flag to build software and hardware datapaths
that work together. One example we have found particularly useful
is to use hardware resources to set the skb->mark on the skb when
the match may be expensive to run in software but a mark lookup
in a hash table is cheap. The idea here is hardware can do in one
lookup what the u32 classifier may need to traverse multiple lists
and hash tables to compute. The flag is only passed down on inserts.
On deletion to avoid stale references in hardware we always try
to remove a rule if it exists.

The flags field is part of the classifier specific options. Although
it is tempting to lift this into the generic structure doing this
proves difficult do to how the tc netlink attributes are implemented
along with how the dump/change routines are called. There is also
precedence for putting seemingly generic pieces in the specific
classifier options such as TCA_U32_POLICE, TCA_U32_ACT, etc. So
although not ideal I've left FLAGS in the u32 options as well as it
simplifies the code greatly and user space has already learned how
to manage these bits ala 'tc' tool.

Another thing if trying to update a rule we require the flags to
be unchanged. This is to force user space, software u32 and
the hardware u32 to keep in sync. Thanks to Simon Horman for
catching this case.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/pkt_cls.h        | 13 +++++++++++--
 include/uapi/linux/pkt_cls.h |  1 +
 net/sched/cls_u32.c          | 37 +++++++++++++++++++++++++++----------
 3 files changed, 39 insertions(+), 12 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index 6096e96fb78b..bea14eee373e 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -392,12 +392,21 @@ struct tc_cls_u32_offload {
 	};
 };
 
-static inline bool tc_should_offload(struct net_device *dev)
+/* tca flags definitions */
+#define TCA_CLS_FLAGS_SKIP_HW 1
+
+static inline bool tc_should_offload(struct net_device *dev, u32 flags)
 {
 	if (!(dev->features & NETIF_F_HW_TC))
 		return false;
 
-	return dev->netdev_ops->ndo_setup_tc;
+	if (flags & TCA_CLS_FLAGS_SKIP_HW)
+		return false;
+
+	if (!dev->netdev_ops->ndo_setup_tc)
+		return false;
+
+	return true;
 }
 
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 439873775d49..9874f5680926 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -172,6 +172,7 @@ enum {
 	TCA_U32_INDEV,
 	TCA_U32_PCNT,
 	TCA_U32_MARK,
+	TCA_U32_FLAGS,
 	__TCA_U32_MAX
 };
 
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 24e888b9b728..563cdad76448 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -59,6 +59,7 @@ struct tc_u_knode {
 #ifdef CONFIG_CLS_U32_PERF
 	struct tc_u32_pcnt __percpu *pf;
 #endif
+	u32			flags;
 #ifdef CONFIG_CLS_U32_MARK
 	u32			val;
 	u32			mask;
@@ -434,7 +435,7 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
 		offload.cls_u32->knode.handle = handle;
 		dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
@@ -442,7 +443,9 @@ static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
 	}
 }
 
-static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
+static void u32_replace_hw_hnode(struct tcf_proto *tp,
+				 struct tc_u_hnode *h,
+				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
@@ -451,7 +454,7 @@ static void u32_replace_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, flags)) {
 		offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -471,7 +474,7 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, 0)) {
 		offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
 		offload.cls_u32->hnode.divisor = h->divisor;
 		offload.cls_u32->hnode.handle = h->handle;
@@ -482,7 +485,9 @@ static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
 	}
 }
 
-static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
+static void u32_replace_hw_knode(struct tcf_proto *tp,
+				 struct tc_u_knode *n,
+				 u32 flags)
 {
 	struct net_device *dev = tp->q->dev_queue->dev;
 	struct tc_cls_u32_offload u32_offload = {0};
@@ -491,7 +496,7 @@ static void u32_replace_hw_knode(struct tcf_proto *tp, struct tc_u_knode *n)
 	offload.type = TC_SETUP_CLSU32;
 	offload.cls_u32 = &u32_offload;
 
-	if (tc_should_offload(dev)) {
+	if (tc_should_offload(dev, flags)) {
 		offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
 		offload.cls_u32->knode.handle = n->handle;
 		offload.cls_u32->knode.fshift = n->fshift;
@@ -679,6 +684,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
 	[TCA_U32_SEL]		= { .len = sizeof(struct tc_u32_sel) },
 	[TCA_U32_INDEV]		= { .type = NLA_STRING, .len = IFNAMSIZ },
 	[TCA_U32_MARK]		= { .len = sizeof(struct tc_u32_mark) },
+	[TCA_U32_FLAGS]		= { .type = NLA_U32 },
 };
 
 static int u32_set_parms(struct net *net, struct tcf_proto *tp,
@@ -786,6 +792,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
 #endif
 	new->fshift = n->fshift;
 	new->res = n->res;
+	new->flags = n->flags;
 	RCU_INIT_POINTER(new->ht_down, n->ht_down);
 
 	/* bump reference count as long as we hold pointer to structure */
@@ -825,7 +832,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	struct tc_u32_sel *s;
 	struct nlattr *opt = tca[TCA_OPTIONS];
 	struct nlattr *tb[TCA_U32_MAX + 1];
-	u32 htid;
+	u32 htid, flags = 0;
 	int err;
 #ifdef CONFIG_CLS_U32_PERF
 	size_t size;
@@ -838,6 +845,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	if (err < 0)
 		return err;
 
+	if (tb[TCA_U32_FLAGS])
+		flags = nla_get_u32(tb[TCA_U32_FLAGS]);
+
 	n = (struct tc_u_knode *)*arg;
 	if (n) {
 		struct tc_u_knode *new;
@@ -845,6 +855,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		if (TC_U32_KEY(n->handle) == 0)
 			return -EINVAL;
 
+		if (n->flags != flags)
+			return -EINVAL;
+
 		new = u32_init_knode(tp, n);
 		if (!new)
 			return -ENOMEM;
@@ -861,7 +874,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		u32_replace_knode(tp, tp_c, new);
 		tcf_unbind_filter(tp, &n->res);
 		call_rcu(&n->rcu, u32_delete_key_rcu);
-		u32_replace_hw_knode(tp, new);
+		u32_replace_hw_knode(tp, new, flags);
 		return 0;
 	}
 
@@ -889,7 +902,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 		rcu_assign_pointer(tp_c->hlist, ht);
 		*arg = (unsigned long)ht;
 
-		u32_replace_hw_hnode(tp, ht);
+		u32_replace_hw_hnode(tp, ht, flags);
 		return 0;
 	}
 
@@ -940,6 +953,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 	RCU_INIT_POINTER(n->ht_up, ht);
 	n->handle = handle;
 	n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
+	n->flags = flags;
 	tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
 	n->tp = tp;
 
@@ -972,7 +986,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
 
 		RCU_INIT_POINTER(n->next, pins);
 		rcu_assign_pointer(*ins, n);
-		u32_replace_hw_knode(tp, n);
+		u32_replace_hw_knode(tp, n, flags);
 		*arg = (unsigned long)n;
 		return 0;
 	}
@@ -1077,6 +1091,9 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
 		    nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
 			goto nla_put_failure;
 
+		if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags))
+			goto nla_put_failure;
+
 #ifdef CONFIG_CLS_U32_MARK
 		if ((n->val || n->mask)) {
 			struct tc_u32_mark mark = {.val = n->val,
-- 
cgit v1.2.3


From bfcd3a46617209454cfc0947ab093e37fd1e84ef Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jiri@mellanox.com>
Date: Fri, 26 Feb 2016 17:32:23 +0100
Subject: Introduce devlink infrastructure

Introduce devlink infrastructure for drivers to register and expose to
userspace via generic Netlink interface.

There are two basic objects defined:
devlink - one instance for every "parent device", for example switch ASIC
devlink port - one instance for every physical port of the device.

This initial portion implements basic get/dump of objects to userspace.
Also, port splitter and port type setting is implemented.

Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS                  |   8 +
 include/net/devlink.h        | 140 ++++++++
 include/uapi/linux/devlink.h |  72 +++++
 net/Kconfig                  |   7 +
 net/core/Makefile            |   1 +
 net/core/devlink.c           | 738 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 966 insertions(+)
 create mode 100644 include/net/devlink.h
 create mode 100644 include/uapi/linux/devlink.h
 create mode 100644 net/core/devlink.c

(limited to 'include/uapi/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 12b764f4c93c..e45682745263 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3499,6 +3499,14 @@ F:	include/linux/device-mapper.h
 F:	include/linux/dm-*.h
 F:	include/uapi/linux/dm-*.h
 
+DEVLINK
+M:	Jiri Pirko <jiri@mellanox.com>
+L:	netdev@vger.kernel.org
+S:	Supported
+F:	net/core/devlink.c
+F:	include/net/devlink.h
+F:	include/uapi/linux/devlink.h
+
 DIALOG SEMICONDUCTOR DRIVERS
 M:	Support Opensource <support.opensource@diasemi.com>
 W:	http://www.dialog-semiconductor.com/products
diff --git a/include/net/devlink.h b/include/net/devlink.h
new file mode 100644
index 000000000000..c37d257891d6
--- /dev/null
+++ b/include/net/devlink.h
@@ -0,0 +1,140 @@
+/*
+ * include/net/devlink.h - Network physical device Netlink interface
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#ifndef _NET_DEVLINK_H_
+#define _NET_DEVLINK_H_
+
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <uapi/linux/devlink.h>
+
+struct devlink_ops;
+
+struct devlink {
+	struct list_head list;
+	struct list_head port_list;
+	const struct devlink_ops *ops;
+	struct device *dev;
+	possible_net_t _net;
+	char priv[0] __aligned(NETDEV_ALIGN);
+};
+
+struct devlink_port {
+	struct list_head list;
+	struct devlink *devlink;
+	unsigned index;
+	bool registered;
+	enum devlink_port_type type;
+	enum devlink_port_type desired_type;
+	void *type_dev;
+	bool split;
+	u32 split_group;
+};
+
+struct devlink_ops {
+	size_t priv_size;
+	int (*port_type_set)(struct devlink_port *devlink_port,
+			     enum devlink_port_type port_type);
+	int (*port_split)(struct devlink *devlink, unsigned int port_index,
+			  unsigned int count);
+	int (*port_unsplit)(struct devlink *devlink, unsigned int port_index);
+};
+
+static inline void *devlink_priv(struct devlink *devlink)
+{
+	BUG_ON(!devlink);
+	return &devlink->priv;
+}
+
+static inline struct devlink *priv_to_devlink(void *priv)
+{
+	BUG_ON(!priv);
+	return container_of(priv, struct devlink, priv);
+}
+
+struct ib_device;
+
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size);
+int devlink_register(struct devlink *devlink, struct device *dev);
+void devlink_unregister(struct devlink *devlink);
+void devlink_free(struct devlink *devlink);
+int devlink_port_register(struct devlink *devlink,
+			  struct devlink_port *devlink_port,
+			  unsigned int port_index);
+void devlink_port_unregister(struct devlink_port *devlink_port);
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+			       struct net_device *netdev);
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+			      struct ib_device *ibdev);
+void devlink_port_type_clear(struct devlink_port *devlink_port);
+void devlink_port_split_set(struct devlink_port *devlink_port,
+			    u32 split_group);
+
+#else
+
+static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
+					    size_t priv_size)
+{
+	return kzalloc(sizeof(struct devlink) + priv_size, GFP_KERNEL);
+}
+
+static inline int devlink_register(struct devlink *devlink, struct device *dev)
+{
+	return 0;
+}
+
+static inline void devlink_unregister(struct devlink *devlink)
+{
+}
+
+static inline void devlink_free(struct devlink *devlink)
+{
+	kfree(devlink);
+}
+
+static inline int devlink_port_register(struct devlink *devlink,
+					struct devlink_port *devlink_port,
+					unsigned int port_index)
+{
+	return 0;
+}
+
+static inline void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+}
+
+static inline void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+					     struct net_device *netdev)
+{
+}
+
+static inline void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+					    struct ib_device *ibdev)
+{
+}
+
+static inline void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+}
+
+static inline void devlink_port_split_set(struct devlink_port *devlink_port,
+					  u32 split_group)
+{
+}
+
+#endif
+
+#endif /* _NET_DEVLINK_H_ */
diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
new file mode 100644
index 000000000000..c9fee5781eb1
--- /dev/null
+++ b/include/uapi/linux/devlink.h
@@ -0,0 +1,72 @@
+/*
+ * include/uapi/linux/devlink.h - Network physical device Netlink interface
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _UAPI_LINUX_DEVLINK_H_
+#define _UAPI_LINUX_DEVLINK_H_
+
+#define DEVLINK_GENL_NAME "devlink"
+#define DEVLINK_GENL_VERSION 0x1
+#define DEVLINK_GENL_MCGRP_CONFIG_NAME "config"
+
+enum devlink_command {
+	/* don't change the order or add anything between, this is ABI! */
+	DEVLINK_CMD_UNSPEC,
+
+	DEVLINK_CMD_GET,		/* can dump */
+	DEVLINK_CMD_SET,
+	DEVLINK_CMD_NEW,
+	DEVLINK_CMD_DEL,
+
+	DEVLINK_CMD_PORT_GET,		/* can dump */
+	DEVLINK_CMD_PORT_SET,
+	DEVLINK_CMD_PORT_NEW,
+	DEVLINK_CMD_PORT_DEL,
+
+	DEVLINK_CMD_PORT_SPLIT,
+	DEVLINK_CMD_PORT_UNSPLIT,
+
+	/* add new commands above here */
+
+	__DEVLINK_CMD_MAX,
+	DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1
+};
+
+enum devlink_port_type {
+	DEVLINK_PORT_TYPE_NOTSET,
+	DEVLINK_PORT_TYPE_AUTO,
+	DEVLINK_PORT_TYPE_ETH,
+	DEVLINK_PORT_TYPE_IB,
+};
+
+enum devlink_attr {
+	/* don't change the order or add anything between, this is ABI! */
+	DEVLINK_ATTR_UNSPEC,
+
+	/* bus name + dev name together are a handle for devlink entity */
+	DEVLINK_ATTR_BUS_NAME,			/* string */
+	DEVLINK_ATTR_DEV_NAME,			/* string */
+
+	DEVLINK_ATTR_PORT_INDEX,		/* u32 */
+	DEVLINK_ATTR_PORT_TYPE,			/* u16 */
+	DEVLINK_ATTR_PORT_DESIRED_TYPE,		/* u16 */
+	DEVLINK_ATTR_PORT_NETDEV_IFINDEX,	/* u32 */
+	DEVLINK_ATTR_PORT_NETDEV_NAME,		/* string */
+	DEVLINK_ATTR_PORT_IBDEV_NAME,		/* string */
+	DEVLINK_ATTR_PORT_SPLIT_COUNT,		/* u32 */
+	DEVLINK_ATTR_PORT_SPLIT_GROUP,		/* u32 */
+
+	/* add new attributes above here, update the policy in devlink.c */
+
+	__DEVLINK_ATTR_MAX,
+	DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1
+};
+
+#endif /* _UAPI_LINUX_DEVLINK_H_ */
diff --git a/net/Kconfig b/net/Kconfig
index b80efecfc1a0..6c9cfb0d7639 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -396,6 +396,13 @@ config DST_CACHE
 	bool "dst cache"
 	default n
 
+config NET_DEVLINK
+	tristate "Network physical/parent device Netlink interface"
+	help
+	  Network physical/parent device Netlink interface provides
+	  infrastructure to support access to physical chip-wide config and
+	  monitoring.
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF_JIT
diff --git a/net/core/Makefile b/net/core/Makefile
index 7a8fb8aef992..014422e2561f 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
+obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/devlink.c b/net/core/devlink.c
new file mode 100644
index 000000000000..590fa561cb7f
--- /dev/null
+++ b/net/core/devlink.c
@@ -0,0 +1,738 @@
+/*
+ * net/core/devlink.c - Network physical/parent device Netlink interface
+ *
+ * Heavily inspired by net/wireless/
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <rdma/ib_verbs.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/devlink.h>
+
+static LIST_HEAD(devlink_list);
+
+/* devlink_mutex
+ *
+ * An overall lock guarding every operation coming from userspace.
+ * It also guards devlink devices list and it is taken when
+ * driver registers/unregisters it.
+ */
+static DEFINE_MUTEX(devlink_mutex);
+
+/* devlink_port_mutex
+ *
+ * Shared lock to guard lists of ports in all devlink devices.
+ */
+static DEFINE_MUTEX(devlink_port_mutex);
+
+static struct net *devlink_net(const struct devlink *devlink)
+{
+	return read_pnet(&devlink->_net);
+}
+
+static void devlink_net_set(struct devlink *devlink, struct net *net)
+{
+	write_pnet(&devlink->_net, net);
+}
+
+static struct devlink *devlink_get_from_attrs(struct net *net,
+					      struct nlattr **attrs)
+{
+	struct devlink *devlink;
+	char *busname;
+	char *devname;
+
+	if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME])
+		return ERR_PTR(-EINVAL);
+
+	busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
+	devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
+
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (strcmp(devlink->dev->bus->name, busname) == 0 &&
+		    strcmp(dev_name(devlink->dev), devname) == 0 &&
+		    net_eq(devlink_net(devlink), net))
+			return devlink;
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+static struct devlink *devlink_get_from_info(struct genl_info *info)
+{
+	return devlink_get_from_attrs(genl_info_net(info), info->attrs);
+}
+
+static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+						      int port_index)
+{
+	struct devlink_port *devlink_port;
+
+	list_for_each_entry(devlink_port, &devlink->port_list, list) {
+		if (devlink_port->index == port_index)
+			return devlink_port;
+	}
+	return NULL;
+}
+
+static bool devlink_port_index_exists(struct devlink *devlink, int port_index)
+{
+	return devlink_port_get_by_index(devlink, port_index);
+}
+
+static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+							struct nlattr **attrs)
+{
+	if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+		u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+		struct devlink_port *devlink_port;
+
+		devlink_port = devlink_port_get_by_index(devlink, port_index);
+		if (!devlink_port)
+			return ERR_PTR(-ENODEV);
+		return devlink_port;
+	}
+	return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
+						       struct genl_info *info)
+{
+	return devlink_port_get_from_attrs(devlink, info->attrs);
+}
+
+#define DEVLINK_NL_FLAG_NEED_PORT	BIT(0)
+
+static int devlink_nl_pre_doit(const struct genl_ops *ops,
+			       struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink;
+
+	mutex_lock(&devlink_mutex);
+	devlink = devlink_get_from_info(info);
+	if (IS_ERR(devlink)) {
+		mutex_unlock(&devlink_mutex);
+		return PTR_ERR(devlink);
+	}
+	info->user_ptr[0] = devlink;
+	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
+		struct devlink_port *devlink_port;
+
+		mutex_lock(&devlink_port_mutex);
+		devlink_port = devlink_port_get_from_info(devlink, info);
+		if (IS_ERR(devlink_port)) {
+			mutex_unlock(&devlink_port_mutex);
+			mutex_unlock(&devlink_mutex);
+			return PTR_ERR(devlink_port);
+		}
+		info->user_ptr[1] = devlink_port;
+	}
+	return 0;
+}
+
+static void devlink_nl_post_doit(const struct genl_ops *ops,
+				 struct sk_buff *skb, struct genl_info *info)
+{
+	if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
+		mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink_mutex);
+}
+
+static struct genl_family devlink_nl_family = {
+	.id		= GENL_ID_GENERATE,
+	.name		= DEVLINK_GENL_NAME,
+	.version	= DEVLINK_GENL_VERSION,
+	.maxattr	= DEVLINK_ATTR_MAX,
+	.netnsok	= true,
+	.pre_doit	= devlink_nl_pre_doit,
+	.post_doit	= devlink_nl_post_doit,
+};
+
+enum devlink_multicast_groups {
+	DEVLINK_MCGRP_CONFIG,
+};
+
+static const struct genl_multicast_group devlink_nl_mcgrps[] = {
+	[DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
+};
+
+static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+	if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
+		return -EMSGSIZE;
+	if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
+			   enum devlink_command cmd, u32 portid,
+			   u32 seq, int flags)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+{
+	struct sk_buff *msg;
+	int err;
+
+	WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
+				struct devlink_port *devlink_port,
+				enum devlink_command cmd, u32 portid,
+				u32 seq, int flags)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+	if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+		goto nla_put_failure;
+	if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
+		goto nla_put_failure;
+	if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
+	    nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
+			devlink_port->desired_type))
+		goto nla_put_failure;
+	if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+		struct net_device *netdev = devlink_port->type_dev;
+
+		if (netdev &&
+		    (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
+				 netdev->ifindex) ||
+		     nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
+				    netdev->name)))
+			goto nla_put_failure;
+	}
+	if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
+		struct ib_device *ibdev = devlink_port->type_dev;
+
+		if (ibdev &&
+		    nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
+				   ibdev->name))
+			goto nla_put_failure;
+	}
+	if (devlink_port->split &&
+	    nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+			devlink_port->split_group))
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+static void devlink_port_notify(struct devlink_port *devlink_port,
+				enum devlink_command cmd)
+{
+	struct devlink *devlink = devlink_port->devlink;
+	struct sk_buff *msg;
+	int err;
+
+	if (!devlink_port->registered)
+		return;
+
+	WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+				msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+			      info->snd_portid, info->snd_seq, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
+				     struct netlink_callback *cb)
+{
+	struct devlink *devlink;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		if (idx < start) {
+			idx++;
+			continue;
+		}
+		err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+				      NETLINK_CB(cb->skb).portid,
+				      cb->nlh->nlmsg_seq, NLM_F_MULTI);
+		if (err)
+			goto out;
+		idx++;
+	}
+out:
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	struct sk_buff *msg;
+	int err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	err = devlink_nl_port_fill(msg, devlink, devlink_port,
+				   DEVLINK_CMD_PORT_NEW,
+				   info->snd_portid, info->snd_seq, 0);
+	if (err) {
+		nlmsg_free(msg);
+		return err;
+	}
+
+	return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
+					  struct netlink_callback *cb)
+{
+	struct devlink *devlink;
+	struct devlink_port *devlink_port;
+	int start = cb->args[0];
+	int idx = 0;
+	int err;
+
+	mutex_lock(&devlink_mutex);
+	mutex_lock(&devlink_port_mutex);
+	list_for_each_entry(devlink, &devlink_list, list) {
+		if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+			continue;
+		list_for_each_entry(devlink_port, &devlink->port_list, list) {
+			if (idx < start) {
+				idx++;
+				continue;
+			}
+			err = devlink_nl_port_fill(msg, devlink, devlink_port,
+						   DEVLINK_CMD_NEW,
+						   NETLINK_CB(cb->skb).portid,
+						   cb->nlh->nlmsg_seq,
+						   NLM_F_MULTI);
+			if (err)
+				goto out;
+			idx++;
+		}
+	}
+out:
+	mutex_unlock(&devlink_port_mutex);
+	mutex_unlock(&devlink_mutex);
+
+	cb->args[0] = idx;
+	return msg->len;
+}
+
+static int devlink_port_type_set(struct devlink *devlink,
+				 struct devlink_port *devlink_port,
+				 enum devlink_port_type port_type)
+
+{
+	int err;
+
+	if (devlink->ops && devlink->ops->port_type_set) {
+		if (port_type == DEVLINK_PORT_TYPE_NOTSET)
+			return -EINVAL;
+		err = devlink->ops->port_type_set(devlink_port, port_type);
+		if (err)
+			return err;
+		devlink_port->desired_type = port_type;
+		devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	int err;
+
+	if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
+		enum devlink_port_type port_type;
+
+		port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
+		err = devlink_port_type_set(devlink, devlink_port, port_type);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int devlink_port_split(struct devlink *devlink,
+			      u32 port_index, u32 count)
+
+{
+	if (devlink->ops && devlink->ops->port_split)
+		return devlink->ops->port_split(devlink, port_index, count);
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
+					  struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	u32 port_index;
+	u32 count;
+
+	if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] ||
+	    !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT])
+		return -EINVAL;
+
+	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+	count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
+	return devlink_port_split(devlink, port_index, count);
+}
+
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index)
+
+{
+	if (devlink->ops && devlink->ops->port_unsplit)
+		return devlink->ops->port_unsplit(devlink, port_index);
+	return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
+					    struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	u32 port_index;
+
+	if (!info->attrs[DEVLINK_ATTR_PORT_INDEX])
+		return -EINVAL;
+
+	port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+	return devlink_port_unsplit(devlink, port_index);
+}
+
+static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+	[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+	[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
+	[DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
+	[DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
+};
+
+static const struct genl_ops devlink_nl_ops[] = {
+	{
+		.cmd = DEVLINK_CMD_GET,
+		.doit = devlink_nl_cmd_get_doit,
+		.dumpit = devlink_nl_cmd_get_dumpit,
+		.policy = devlink_nl_policy,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_GET,
+		.doit = devlink_nl_cmd_port_get_doit,
+		.dumpit = devlink_nl_cmd_port_get_dumpit,
+		.policy = devlink_nl_policy,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+		/* can be retrieved by unprivileged users */
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_SET,
+		.doit = devlink_nl_cmd_port_set_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+		.internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_SPLIT,
+		.doit = devlink_nl_cmd_port_split_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = DEVLINK_CMD_PORT_UNSPLIT,
+		.doit = devlink_nl_cmd_port_unsplit_doit,
+		.policy = devlink_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+};
+
+/**
+ *	devlink_alloc - Allocate new devlink instance resources
+ *
+ *	@ops: ops
+ *	@priv_size: size of user private data
+ *
+ *	Allocate new devlink instance resources, including devlink index
+ *	and name.
+ */
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
+{
+	struct devlink *devlink;
+
+	devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
+	if (!devlink)
+		return NULL;
+	devlink->ops = ops;
+	devlink_net_set(devlink, &init_net);
+	INIT_LIST_HEAD(&devlink->port_list);
+	return devlink;
+}
+EXPORT_SYMBOL_GPL(devlink_alloc);
+
+/**
+ *	devlink_register - Register devlink instance
+ *
+ *	@devlink: devlink
+ */
+int devlink_register(struct devlink *devlink, struct device *dev)
+{
+	mutex_lock(&devlink_mutex);
+	devlink->dev = dev;
+	list_add_tail(&devlink->list, &devlink_list);
+	devlink_notify(devlink, DEVLINK_CMD_NEW);
+	mutex_unlock(&devlink_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_register);
+
+/**
+ *	devlink_unregister - Unregister devlink instance
+ *
+ *	@devlink: devlink
+ */
+void devlink_unregister(struct devlink *devlink)
+{
+	mutex_lock(&devlink_mutex);
+	devlink_notify(devlink, DEVLINK_CMD_DEL);
+	list_del(&devlink->list);
+	mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_unregister);
+
+/**
+ *	devlink_free - Free devlink instance resources
+ *
+ *	@devlink: devlink
+ */
+void devlink_free(struct devlink *devlink)
+{
+	kfree(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_free);
+
+/**
+ *	devlink_port_register - Register devlink port
+ *
+ *	@devlink: devlink
+ *	@devlink_port: devlink port
+ *	@port_index
+ *
+ *	Register devlink port with provided port index. User can use
+ *	any indexing, even hw-related one. devlink_port structure
+ *	is convenient to be embedded inside user driver private structure.
+ *	Note that the caller should take care of zeroing the devlink_port
+ *	structure.
+ */
+int devlink_port_register(struct devlink *devlink,
+			  struct devlink_port *devlink_port,
+			  unsigned int port_index)
+{
+	mutex_lock(&devlink_port_mutex);
+	if (devlink_port_index_exists(devlink, port_index)) {
+		mutex_unlock(&devlink_port_mutex);
+		return -EEXIST;
+	}
+	devlink_port->devlink = devlink;
+	devlink_port->index = port_index;
+	devlink_port->type = DEVLINK_PORT_TYPE_NOTSET;
+	devlink_port->registered = true;
+	list_add_tail(&devlink_port->list, &devlink->port_list);
+	mutex_unlock(&devlink_port_mutex);
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_register);
+
+/**
+ *	devlink_port_unregister - Unregister devlink port
+ *
+ *	@devlink_port: devlink port
+ */
+void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+	mutex_lock(&devlink_port_mutex);
+	list_del(&devlink_port->list);
+	mutex_unlock(&devlink_port_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_port_unregister);
+
+static void __devlink_port_type_set(struct devlink_port *devlink_port,
+				    enum devlink_port_type type,
+				    void *type_dev)
+{
+	devlink_port->type = type;
+	devlink_port->type_dev = type_dev;
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+/**
+ *	devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ *	@devlink_port: devlink port
+ *	@netdev: related netdevice
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+			       struct net_device *netdev)
+{
+	return __devlink_port_type_set(devlink_port,
+				       DEVLINK_PORT_TYPE_ETH, netdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
+
+/**
+ *	devlink_port_type_ib_set - Set port type to InfiniBand
+ *
+ *	@devlink_port: devlink port
+ *	@ibdev: related IB device
+ */
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+			      struct ib_device *ibdev)
+{
+	return __devlink_port_type_set(devlink_port,
+				       DEVLINK_PORT_TYPE_IB, ibdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
+
+/**
+ *	devlink_port_type_clear - Clear port type
+ *
+ *	@devlink_port: devlink port
+ */
+void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+	return __devlink_port_type_set(devlink_port,
+				       DEVLINK_PORT_TYPE_NOTSET, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+
+/**
+ *	devlink_port_split_set - Set port is split
+ *
+ *	@devlink_port: devlink port
+ *	@split_group: split group - identifies group split port is part of
+ */
+void devlink_port_split_set(struct devlink_port *devlink_port,
+			    u32 split_group)
+{
+	devlink_port->split = true;
+	devlink_port->split_group = split_group;
+	devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_port_split_set);
+
+static int __init devlink_module_init(void)
+{
+	return genl_register_family_with_ops_groups(&devlink_nl_family,
+						    devlink_nl_ops,
+						    devlink_nl_mcgrps);
+}
+
+static void __exit devlink_module_exit(void)
+{
+	genl_unregister_family(&devlink_nl_family);
+}
+
+module_init(devlink_module_init);
+module_exit(devlink_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Network physical device Netlink interface");
+MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME);
-- 
cgit v1.2.3


From 7f0aec7a668419bdbff12de6e8016544f874e708 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:01 +0100
Subject: bridge: mcast: use names for the different multicast_router types

Using raw values makes it difficult to extend and also understand the
code, give them names and do explicit per-option manipulation in
br_multicast_set_port_router.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  7 +++++
 net/bridge/br_multicast.c      | 61 +++++++++++++++++++++++-------------------
 2 files changed, 40 insertions(+), 28 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 0890b217580d..e47f3bc7f323 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -177,6 +177,13 @@ enum {
 };
 #define MDBA_MDB_EATTR_MAX (__MDBA_MDB_EATTR_MAX - 1)
 
+/* multicast router types */
+enum {
+	MDB_RTR_TYPE_DISABLED,
+	MDB_RTR_TYPE_TEMP_QUERY,
+	MDB_RTR_TYPE_PERM,
+};
+
 enum {
 	MDBA_ROUTER_UNSPEC,
 	MDBA_ROUTER_PORT,
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 8b6e4249be1b..71c109b0943f 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -759,7 +759,7 @@ static void br_multicast_router_expired(unsigned long data)
 	struct net_bridge *br = port->br;
 
 	spin_lock(&br->multicast_lock);
-	if (port->multicast_router != 1 ||
+	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY ||
 	    timer_pending(&port->multicast_router_timer) ||
 	    hlist_unhashed(&port->rlist))
 		goto out;
@@ -912,7 +912,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
 
 void br_multicast_add_port(struct net_bridge_port *port)
 {
-	port->multicast_router = 1;
+	port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 
 	setup_timer(&port->multicast_router_timer, br_multicast_router_expired,
 		    (unsigned long)port);
@@ -959,7 +959,8 @@ void br_multicast_enable_port(struct net_bridge_port *port)
 #if IS_ENABLED(CONFIG_IPV6)
 	br_multicast_enable(&port->ip6_own_query);
 #endif
-	if (port->multicast_router == 2 && hlist_unhashed(&port->rlist))
+	if (port->multicast_router == MDB_RTR_TYPE_PERM &&
+	    hlist_unhashed(&port->rlist))
 		br_multicast_add_router(br, port);
 
 out:
@@ -1227,13 +1228,13 @@ static void br_multicast_mark_router(struct net_bridge *br,
 	unsigned long now = jiffies;
 
 	if (!port) {
-		if (br->multicast_router == 1)
+		if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY)
 			mod_timer(&br->multicast_router_timer,
 				  now + br->multicast_querier_interval);
 		return;
 	}
 
-	if (port->multicast_router != 1)
+	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
 		return;
 
 	br_multicast_add_router(br, port);
@@ -1713,7 +1714,7 @@ void br_multicast_init(struct net_bridge *br)
 	br->hash_elasticity = 4;
 	br->hash_max = 512;
 
-	br->multicast_router = 1;
+	br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 	br->multicast_querier = 0;
 	br->multicast_query_use_ifaddr = 0;
 	br->multicast_last_member_count = 2;
@@ -1823,11 +1824,11 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
 	spin_lock_bh(&br->multicast_lock);
 
 	switch (val) {
-	case 0:
-	case 2:
+	case MDB_RTR_TYPE_DISABLED:
+	case MDB_RTR_TYPE_PERM:
 		del_timer(&br->multicast_router_timer);
 		/* fall through */
-	case 1:
+	case MDB_RTR_TYPE_TEMP_QUERY:
 		br->multicast_router = val;
 		err = 0;
 		break;
@@ -1838,6 +1839,14 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
 	return err;
 }
 
+static void __del_port_router(struct net_bridge_port *p)
+{
+	if (hlist_unhashed(&p->rlist))
+		return;
+	hlist_del_init_rcu(&p->rlist);
+	br_rtr_notify(p->br->dev, p, RTM_DELMDB);
+}
+
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 {
 	struct net_bridge *br = p->br;
@@ -1846,29 +1855,25 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 	spin_lock(&br->multicast_lock);
 
 	switch (val) {
-	case 0:
-	case 1:
-	case 2:
-		p->multicast_router = val;
-		err = 0;
-
-		if (val < 2 && !hlist_unhashed(&p->rlist)) {
-			hlist_del_init_rcu(&p->rlist);
-			br_rtr_notify(br->dev, p, RTM_DELMDB);
-		}
-
-		if (val == 1)
-			break;
-
+	case MDB_RTR_TYPE_DISABLED:
+		p->multicast_router = MDB_RTR_TYPE_DISABLED;
+		__del_port_router(p);
+		del_timer(&p->multicast_router_timer);
+		break;
+	case MDB_RTR_TYPE_TEMP_QUERY:
+		p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+		__del_port_router(p);
+		break;
+	case MDB_RTR_TYPE_PERM:
+		p->multicast_router = MDB_RTR_TYPE_PERM;
 		del_timer(&p->multicast_router_timer);
-
-		if (val == 0)
-			break;
-
 		br_multicast_add_router(br, p);
 		break;
+	default:
+		goto unlock;
 	}
-
+	err = 0;
+unlock:
 	spin_unlock(&br->multicast_lock);
 
 	return err;
-- 
cgit v1.2.3


From a55d8246abcc910346771175b521ee2bce5a69b3 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:03 +0100
Subject: bridge: mcast: add support for temporary port router

Add support for a temporary router port which doesn't depend only on the
incoming query. It can be refreshed if set to the same value, which is
a no-op for the rest.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h |  1 +
 net/bridge/br_multicast.c      | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index e47f3bc7f323..74ee03a47e79 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -182,6 +182,7 @@ enum {
 	MDB_RTR_TYPE_DISABLED,
 	MDB_RTR_TYPE_TEMP_QUERY,
 	MDB_RTR_TYPE_PERM,
+	MDB_RTR_TYPE_TEMP
 };
 
 enum {
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index f1140cf5168d..a4c15df2b792 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -759,13 +759,17 @@ static void br_multicast_router_expired(unsigned long data)
 	struct net_bridge *br = port->br;
 
 	spin_lock(&br->multicast_lock);
-	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY ||
+	if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
+	    port->multicast_router == MDB_RTR_TYPE_PERM ||
 	    timer_pending(&port->multicast_router_timer) ||
 	    hlist_unhashed(&port->rlist))
 		goto out;
 
 	hlist_del_init_rcu(&port->rlist);
 	br_rtr_notify(br->dev, port, RTM_DELMDB);
+	/* Don't allow timer refresh if the router expired */
+	if (port->multicast_router == MDB_RTR_TYPE_TEMP)
+		port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 
 out:
 	spin_unlock(&br->multicast_lock);
@@ -981,6 +985,9 @@ void br_multicast_disable_port(struct net_bridge_port *port)
 	if (!hlist_unhashed(&port->rlist)) {
 		hlist_del_init_rcu(&port->rlist);
 		br_rtr_notify(br->dev, port, RTM_DELMDB);
+		/* Don't allow timer refresh if disabling */
+		if (port->multicast_router == MDB_RTR_TYPE_TEMP)
+			port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
 	}
 	del_timer(&port->multicast_router_timer);
 	del_timer(&port->ip4_own_query.timer);
@@ -1234,7 +1241,8 @@ static void br_multicast_mark_router(struct net_bridge *br,
 		return;
 	}
 
-	if (port->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
+	if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
+	    port->multicast_router == MDB_RTR_TYPE_PERM)
 		return;
 
 	br_multicast_add_router(br, port);
@@ -1850,10 +1858,15 @@ static void __del_port_router(struct net_bridge_port *p)
 int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 {
 	struct net_bridge *br = p->br;
+	unsigned long now = jiffies;
 	int err = -EINVAL;
 
 	spin_lock(&br->multicast_lock);
 	if (p->multicast_router == val) {
+		/* Refresh the temp router port timer */
+		if (p->multicast_router == MDB_RTR_TYPE_TEMP)
+			mod_timer(&p->multicast_router_timer,
+				  now + br->multicast_querier_interval);
 		err = 0;
 		goto unlock;
 	}
@@ -1872,6 +1885,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
 		del_timer(&p->multicast_router_timer);
 		br_multicast_add_router(br, p);
 		break;
+	case MDB_RTR_TYPE_TEMP:
+		p->multicast_router = MDB_RTR_TYPE_TEMP;
+		br_multicast_mark_router(br, p);
+		break;
 	default:
 		goto unlock;
 	}
-- 
cgit v1.2.3


From 59f78f9f6c2e80dcf0f520be85b660f856217b79 Mon Sep 17 00:00:00 2001
From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Date: Fri, 26 Feb 2016 21:20:04 +0100
Subject: bridge: mcast: add support for more router port information dumping

Allow for more multicast router port information to be dumped such as
timer and type attributes. For that that purpose we need to extend the
MDBA_ROUTER_PORT attribute similar to how it was done for the mdb entries
recently. The new format is thus:
[MDBA_ROUTER_PORT] = { <- nested attribute
    u32 ifindex <- router port ifindex for user-space compatibility
    [MDBA_ROUTER_PATTR attributes]
}
This way it remains compatible with older users (they'll simply retrieve
the u32 in the beginning) and new users can parse the remaining
attributes. It would also allow to add future extensions to the router
port without breaking compatibility.

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_bridge.h | 14 +++++++++++++-
 net/bridge/br_mdb.c            | 16 ++++++++++++++--
 2 files changed, 27 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h
index 74ee03a47e79..0536eefff9bf 100644
--- a/include/uapi/linux/if_bridge.h
+++ b/include/uapi/linux/if_bridge.h
@@ -144,7 +144,10 @@ struct bridge_vlan_info {
  *     }
  * }
  * [MDBA_ROUTER] = {
- *    [MDBA_ROUTER_PORT]
+ *    [MDBA_ROUTER_PORT] = {
+ *        u32 ifindex
+ *        [MDBA_ROUTER_PATTR attributes]
+ *    }
  * }
  */
 enum {
@@ -192,6 +195,15 @@ enum {
 };
 #define MDBA_ROUTER_MAX (__MDBA_ROUTER_MAX - 1)
 
+/* router port attributes */
+enum {
+	MDBA_ROUTER_PATTR_UNSPEC,
+	MDBA_ROUTER_PATTR_TIMER,
+	MDBA_ROUTER_PATTR_TYPE,
+	__MDBA_ROUTER_PATTR_MAX
+};
+#define MDBA_ROUTER_PATTR_MAX (__MDBA_ROUTER_PATTR_MAX - 1)
+
 struct br_port_msg {
 	__u8  family;
 	__u32 ifindex;
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 73786e2fe065..253bc77eda3b 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -20,7 +20,7 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 {
 	struct net_bridge *br = netdev_priv(dev);
 	struct net_bridge_port *p;
-	struct nlattr *nest;
+	struct nlattr *nest, *port_nest;
 
 	if (!br->multicast_router || hlist_empty(&br->router_list))
 		return 0;
@@ -30,8 +30,20 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
 		return -EMSGSIZE;
 
 	hlist_for_each_entry_rcu(p, &br->router_list, rlist) {
-		if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex))
+		if (!p)
+			continue;
+		port_nest = nla_nest_start(skb, MDBA_ROUTER_PORT);
+		if (!port_nest)
 			goto fail;
+		if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) ||
+		    nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER,
+				br_timer_value(&p->multicast_router_timer)) ||
+		    nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE,
+			       p->multicast_router)) {
+			nla_nest_cancel(skb, port_nest);
+			goto fail;
+		}
+		nla_nest_end(skb, port_nest);
 	}
 
 	nla_nest_end(skb, nest);
-- 
cgit v1.2.3


From ef6980b6becb1afd9d82a4f043749a10ae81bf14 Mon Sep 17 00:00:00 2001
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: Sat, 27 Feb 2016 08:08:54 -0500
Subject: introduce IFE action

This action allows for a sending side to encapsulate arbitrary metadata
which is decapsulated by the receiving end.
The sender runs in encoding mode and the receiver in decode mode.
Both sender and receiver must specify the same ethertype.
At some point we hope to have a registered ethertype and we'll
then provide a default so the user doesnt have to specify it.
For now we enforce the user specify it.

Lets show example usage where we encode icmp from a sender towards
a receiver with an skbmark of 17; both sender and receiver use
ethertype of 0xdead to interop.

YYYY: Lets start with Receiver-side policy config:
xxx: add an ingress qdisc
sudo tc qdisc add dev $ETH ingress

xxx: any packets with ethertype 0xdead will be subjected to ife decoding
xxx: we then restart the classification so we can match on icmp at prio 3
sudo $TC filter add dev $ETH parent ffff: prio 2 protocol 0xdead \
u32 match u32 0 0 flowid 1:1 \
action ife decode reclassify

xxx: on restarting the classification from above if it was an icmp
xxx: packet, then match it here and continue to the next rule at prio 4
xxx: which will match based on skb mark of 17
sudo tc filter add dev $ETH parent ffff: prio 3 protocol ip \
u32 match ip protocol 1 0xff flowid 1:1 \
action continue

xxx: match on skbmark of 0x11 (decimal 17) and accept
sudo tc filter add dev $ETH parent ffff: prio 4 protocol ip \
handle 0x11 fw flowid 1:1 \
action ok

xxx: Lets show the decoding policy
sudo tc -s filter ls dev $ETH parent ffff: protocol 0xdead
xxx:
filter pref 2 u32
filter pref 2 u32 fh 800: ht divisor 1
filter pref 2 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:1  (rule hit 0 success 0)
  match 00000000/00000000 at 0 (success 0 )
        action order 1: ife decode action reclassify
         index 1 ref 1 bind 1 installed 14 sec used 14 sec
         type: 0x0
         Metadata: allow mark allow hash allow prio allow qmap
        Action statistics:
        Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
        backlog 0b 0p requeues 0
xxx:
Observe that above lists all metadatum it can decode. Typically these
submodules will already be compiled into a monolithic kernel or
loaded as modules

YYYY: Lets show the sender side now ..

xxx: Add an egress qdisc on the sender netdev
sudo tc qdisc add dev $ETH root handle 1: prio
xxx:
xxx: Match all icmp packets to 192.168.122.237/24, then
xxx: tag the packet with skb mark of decimal 17, then
xxx: Encode it with:
xxx:	ethertype 0xdead
xxx:	add skb->mark to whitelist of metadatum to send
xxx:	rewrite target dst MAC address to 02:15:15:15:15:15
xxx:
sudo $TC filter add dev $ETH parent 1: protocol ip prio 10  u32 \
match ip dst 192.168.122.237/24 \
match ip protocol 1 0xff \
flowid 1:2 \
action skbedit mark 17 \
action ife encode \
type 0xDEAD \
allow mark \
dst 02:15:15:15:15:15

xxx: Lets show the encoding policy
sudo tc -s filter ls dev $ETH parent 1: protocol ip
xxx:
filter pref 10 u32
filter pref 10 u32 fh 800: ht divisor 1
filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:2  (rule hit 0 success 0)
  match c0a87aed/ffffffff at 16 (success 0 )
  match 00010000/00ff0000 at 8 (success 0 )

	action order 1:  skbedit mark 17
	 index 6 ref 1 bind 1
 	Action statistics:
	Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0

	action order 2: ife encode action pipe
	 index 3 ref 1 bind 1
	 dst MAC: 02:15:15:15:15:15 type: 0xDEAD
 	 Metadata: allow mark
 	Action statistics:
	Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
	backlog 0b 0p requeues 0
xxx:

test by sending ping from sender to destination

Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tc_act/tc_ife.h        |  61 +++
 include/uapi/linux/tc_act/tc_ife.h |  38 ++
 net/sched/Kconfig                  |  12 +
 net/sched/Makefile                 |   1 +
 net/sched/act_ife.c                | 870 +++++++++++++++++++++++++++++++++++++
 5 files changed, 982 insertions(+)
 create mode 100644 include/net/tc_act/tc_ife.h
 create mode 100644 include/uapi/linux/tc_act/tc_ife.h
 create mode 100644 net/sched/act_ife.c

(limited to 'include/uapi/linux')

diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h
new file mode 100644
index 000000000000..dc9a09aefb33
--- /dev/null
+++ b/include/net/tc_act/tc_ife.h
@@ -0,0 +1,61 @@
+#ifndef __NET_TC_IFE_H
+#define __NET_TC_IFE_H
+
+#include <net/act_api.h>
+#include <linux/etherdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+
+#define IFE_METAHDRLEN 2
+struct tcf_ife_info {
+	struct tcf_common common;
+	u8 eth_dst[ETH_ALEN];
+	u8 eth_src[ETH_ALEN];
+	u16 eth_type;
+	u16 flags;
+	/* list of metaids allowed */
+	struct list_head metalist;
+};
+#define to_ife(a) \
+	container_of(a->priv, struct tcf_ife_info, common)
+
+struct tcf_meta_info {
+	const struct tcf_meta_ops *ops;
+	void *metaval;
+	u16 metaid;
+	struct list_head metalist;
+};
+
+struct tcf_meta_ops {
+	u16 metaid; /*Maintainer provided ID */
+	u16 metatype; /*netlink attribute type (look at net/netlink.h) */
+	const char *name;
+	const char *synopsis;
+	struct list_head list;
+	int	(*check_presence)(struct sk_buff *, struct tcf_meta_info *);
+	int	(*encode)(struct sk_buff *, void *, struct tcf_meta_info *);
+	int	(*decode)(struct sk_buff *, void *, u16 len);
+	int	(*get)(struct sk_buff *skb, struct tcf_meta_info *mi);
+	int	(*alloc)(struct tcf_meta_info *, void *);
+	void	(*release)(struct tcf_meta_info *);
+	int	(*validate)(void *val, int len);
+	struct module	*owner;
+};
+
+#define MODULE_ALIAS_IFE_META(metan)   MODULE_ALIAS("ifemeta" __stringify_1(metan))
+
+int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi);
+int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi);
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen,
+			const void *dval);
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval);
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval);
+int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi);
+int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi);
+int ife_validate_meta_u32(void *val, int len);
+int ife_validate_meta_u16(void *val, int len);
+void ife_release_meta_gen(struct tcf_meta_info *mi);
+int register_ife_op(struct tcf_meta_ops *mops);
+int unregister_ife_op(struct tcf_meta_ops *mops);
+
+#endif /* __NET_TC_IFE_H */
diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h
new file mode 100644
index 000000000000..d648ff66586f
--- /dev/null
+++ b/include/uapi/linux/tc_act/tc_ife.h
@@ -0,0 +1,38 @@
+#ifndef __UAPI_TC_IFE_H
+#define __UAPI_TC_IFE_H
+
+#include <linux/types.h>
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_IFE 25
+/* Flag bits for now just encoding/decoding; mutually exclusive */
+#define IFE_ENCODE 1
+#define IFE_DECODE 0
+
+struct tc_ife {
+	tc_gen;
+	__u16 flags;
+};
+
+/*XXX: We need to encode the total number of bytes consumed */
+enum {
+	TCA_IFE_UNSPEC,
+	TCA_IFE_PARMS,
+	TCA_IFE_TM,
+	TCA_IFE_DMAC,
+	TCA_IFE_SMAC,
+	TCA_IFE_TYPE,
+	TCA_IFE_METALST,
+	__TCA_IFE_MAX
+};
+#define TCA_IFE_MAX (__TCA_IFE_MAX - 1)
+
+#define IFE_META_SKBMARK 1
+#define IFE_META_HASHID 2
+#define	IFE_META_PRIO 3
+#define	IFE_META_QMAP 4
+/*Can be overridden at runtime by module option*/
+#define	__IFE_META_MAX 5
+#define IFE_META_MAX (__IFE_META_MAX - 1)
+
+#endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 82830824fb1f..4d48ef57e564 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -739,6 +739,18 @@ config NET_ACT_CONNMARK
 	  To compile this code as a module, choose M here: the
 	  module will be called act_connmark.
 
+config NET_ACT_IFE
+        tristate "Inter-FE action based on IETF ForCES InterFE LFB"
+        depends on NET_CLS_ACT
+        ---help---
+	  Say Y here to allow for sourcing and terminating metadata
+	  For details refer to netdev01 paper:
+	  "Distributing Linux Traffic Control Classifier-Action Subsystem"
+	   Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_ife.
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 690c1689e090..3d176671b0e1 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
 obj-$(CONFIG_NET_ACT_VLAN)	+= act_vlan.o
 obj-$(CONFIG_NET_ACT_BPF)	+= act_bpf.o
 obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
+obj-$(CONFIG_NET_ACT_IFE)	+= act_ife.o
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
new file mode 100644
index 000000000000..6e7ec257790d
--- /dev/null
+++ b/net/sched/act_ife.c
@@ -0,0 +1,870 @@
+/*
+ * net/sched/ife.c	Inter-FE action based on ForCES WG InterFE LFB
+ *
+ *		Refer to:
+ *		draft-ietf-forces-interfelfb-03
+ *		and
+ *		netdev01 paper:
+ *		"Distributing Linux Traffic Control Classifier-Action
+ *		Subsystem"
+ *		Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+#include <linux/etherdevice.h>
+
+#define IFE_TAB_MASK 15
+
+static int ife_net_id;
+static int max_metacnt = IFE_META_MAX + 1;
+
+static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
+	[TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)},
+	[TCA_IFE_DMAC] = { .len = ETH_ALEN},
+	[TCA_IFE_SMAC] = { .len = ETH_ALEN},
+	[TCA_IFE_TYPE] = { .type = NLA_U16},
+};
+
+/* Caller takes care of presenting data in network order
+*/
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
+{
+	u32 *tlv = (u32 *)(skbdata);
+	u16 totlen = nla_total_size(dlen);	/*alignment + hdr */
+	char *dptr = (char *)tlv + NLA_HDRLEN;
+	u32 htlv = attrtype << 16 | totlen;
+
+	*tlv = htonl(htlv);
+	memset(dptr, 0, totlen - NLA_HDRLEN);
+	memcpy(dptr, dval, dlen);
+
+	return totlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+
+int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+	if (mi->metaval)
+		return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval);
+	else
+		return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u32);
+
+int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi)
+{
+	if (metaval || mi->metaval)
+		return 8; /* T+L+V == 2+2+4 */
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_check_meta_u32);
+
+int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
+{
+	u32 edata = metaval;
+
+	if (mi->metaval)
+		edata = *(u32 *)mi->metaval;
+	else if (metaval)
+		edata = metaval;
+
+	if (!edata) /* will not encode */
+		return 0;
+
+	edata = htonl(edata);
+	return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata);
+}
+EXPORT_SYMBOL_GPL(ife_encode_meta_u32);
+
+int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+	if (mi->metaval)
+		return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval);
+	else
+		return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u16);
+
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
+{
+	mi->metaval = kmemdup(&metaval, sizeof(u32), GFP_KERNEL);
+	if (!mi->metaval)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);
+
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval)
+{
+	mi->metaval = kmemdup(&metaval, sizeof(u16), GFP_KERNEL);
+	if (!mi->metaval)
+		return -ENOMEM;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u16);
+
+void ife_release_meta_gen(struct tcf_meta_info *mi)
+{
+	kfree(mi->metaval);
+}
+EXPORT_SYMBOL_GPL(ife_release_meta_gen);
+
+int ife_validate_meta_u32(void *val, int len)
+{
+	if (len == 4)
+		return 0;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u32);
+
+int ife_validate_meta_u16(void *val, int len)
+{
+	/* length will include padding */
+	if (len == NLA_ALIGN(2))
+		return 0;
+
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u16);
+
+static LIST_HEAD(ifeoplist);
+static DEFINE_RWLOCK(ife_mod_lock);
+
+static struct tcf_meta_ops *find_ife_oplist(u16 metaid)
+{
+	struct tcf_meta_ops *o;
+
+	read_lock(&ife_mod_lock);
+	list_for_each_entry(o, &ifeoplist, list) {
+		if (o->metaid == metaid) {
+			if (!try_module_get(o->owner))
+				o = NULL;
+			read_unlock(&ife_mod_lock);
+			return o;
+		}
+	}
+	read_unlock(&ife_mod_lock);
+
+	return NULL;
+}
+
+int register_ife_op(struct tcf_meta_ops *mops)
+{
+	struct tcf_meta_ops *m;
+
+	if (!mops->metaid || !mops->metatype || !mops->name ||
+	    !mops->check_presence || !mops->encode || !mops->decode ||
+	    !mops->get || !mops->alloc)
+		return -EINVAL;
+
+	write_lock(&ife_mod_lock);
+
+	list_for_each_entry(m, &ifeoplist, list) {
+		if (m->metaid == mops->metaid ||
+		    (strcmp(mops->name, m->name) == 0)) {
+			write_unlock(&ife_mod_lock);
+			return -EEXIST;
+		}
+	}
+
+	if (!mops->release)
+		mops->release = ife_release_meta_gen;
+
+	list_add_tail(&mops->list, &ifeoplist);
+	write_unlock(&ife_mod_lock);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ife_op);
+
+int unregister_ife_op(struct tcf_meta_ops *mops)
+{
+	struct tcf_meta_ops *m;
+	int err = -ENOENT;
+
+	write_lock(&ife_mod_lock);
+	list_for_each_entry(m, &ifeoplist, list) {
+		if (m->metaid == mops->metaid) {
+			list_del(&mops->list);
+			err = 0;
+			break;
+		}
+	}
+	write_unlock(&ife_mod_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(register_ife_op);
+
+static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len)
+{
+	int ret = 0;
+	/* XXX: unfortunately cant use nla_policy at this point
+	* because a length of 0 is valid in the case of
+	* "allow". "use" semantics do enforce for proper
+	* length and i couldve use nla_policy but it makes it hard
+	* to use it just for that..
+	*/
+	if (ops->validate)
+		return ops->validate(val, len);
+
+	if (ops->metatype == NLA_U32)
+		ret = ife_validate_meta_u32(val, len);
+	else if (ops->metatype == NLA_U16)
+		ret = ife_validate_meta_u16(val, len);
+
+	return ret;
+}
+
+/* called when adding new meta information
+ * under ife->tcf_lock
+*/
+static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
+				void *val, int len)
+{
+	struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+	int ret = 0;
+
+	if (!ops) {
+		ret = -ENOENT;
+#ifdef CONFIG_MODULES
+		spin_unlock_bh(&ife->tcf_lock);
+		rtnl_unlock();
+		request_module("ifemeta%u", metaid);
+		rtnl_lock();
+		spin_lock_bh(&ife->tcf_lock);
+		ops = find_ife_oplist(metaid);
+#endif
+	}
+
+	if (ops) {
+		ret = 0;
+		if (len)
+			ret = ife_validate_metatype(ops, val, len);
+
+		module_put(ops->owner);
+	}
+
+	return ret;
+}
+
+/* called when adding new meta information
+ * under ife->tcf_lock
+*/
+static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
+			int len)
+{
+	struct tcf_meta_info *mi = NULL;
+	struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+	int ret = 0;
+
+	if (!ops)
+		return -ENOENT;
+
+	mi = kzalloc(sizeof(*mi), GFP_KERNEL);
+	if (!mi) {
+		/*put back what find_ife_oplist took */
+		module_put(ops->owner);
+		return -ENOMEM;
+	}
+
+	mi->metaid = metaid;
+	mi->ops = ops;
+	if (len > 0) {
+		ret = ops->alloc(mi, metaval);
+		if (ret != 0) {
+			kfree(mi);
+			module_put(ops->owner);
+			return ret;
+		}
+	}
+
+	list_add_tail(&mi->metalist, &ife->metalist);
+
+	return ret;
+}
+
+static int use_all_metadata(struct tcf_ife_info *ife)
+{
+	struct tcf_meta_ops *o;
+	int rc = 0;
+	int installed = 0;
+
+	list_for_each_entry(o, &ifeoplist, list) {
+		rc = add_metainfo(ife, o->metaid, NULL, 0);
+		if (rc == 0)
+			installed += 1;
+	}
+
+	if (installed)
+		return 0;
+	else
+		return -EINVAL;
+}
+
+static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+	struct tcf_meta_info *e;
+	struct nlattr *nest;
+	unsigned char *b = skb_tail_pointer(skb);
+	int total_encoded = 0;
+
+	/*can only happen on decode */
+	if (list_empty(&ife->metalist))
+		return 0;
+
+	nest = nla_nest_start(skb, TCA_IFE_METALST);
+	if (!nest)
+		goto out_nlmsg_trim;
+
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (!e->ops->get(skb, e))
+			total_encoded += 1;
+	}
+
+	if (!total_encoded)
+		goto out_nlmsg_trim;
+
+	nla_nest_end(skb, nest);
+
+	return 0;
+
+out_nlmsg_trim:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+/* under ife->tcf_lock */
+static void _tcf_ife_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_ife_info *ife = a->priv;
+	struct tcf_meta_info *e, *n;
+
+	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+		module_put(e->ops->owner);
+		list_del(&e->metalist);
+		if (e->metaval) {
+			if (e->ops->release)
+				e->ops->release(e);
+			else
+				kfree(e->metaval);
+		}
+		kfree(e);
+	}
+}
+
+static void tcf_ife_cleanup(struct tc_action *a, int bind)
+{
+	struct tcf_ife_info *ife = a->priv;
+
+	spin_lock_bh(&ife->tcf_lock);
+	_tcf_ife_cleanup(a, bind);
+	spin_unlock_bh(&ife->tcf_lock);
+}
+
+/* under ife->tcf_lock */
+static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb)
+{
+	int len = 0;
+	int rc = 0;
+	int i = 0;
+	void *val;
+
+	for (i = 1; i < max_metacnt; i++) {
+		if (tb[i]) {
+			val = nla_data(tb[i]);
+			len = nla_len(tb[i]);
+
+			rc = load_metaops_and_vet(ife, i, val, len);
+			if (rc != 0)
+				return rc;
+
+			rc = add_metainfo(ife, i, val, len);
+			if (rc)
+				return rc;
+		}
+	}
+
+	return rc;
+}
+
+static int tcf_ife_init(struct net *net, struct nlattr *nla,
+			struct nlattr *est, struct tc_action *a,
+			int ovr, int bind)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+	struct nlattr *tb[TCA_IFE_MAX + 1];
+	struct nlattr *tb2[IFE_META_MAX + 1];
+	struct tcf_ife_info *ife;
+	struct tc_ife *parm;
+	u16 ife_type = 0;
+	u8 *daddr = NULL;
+	u8 *saddr = NULL;
+	int ret = 0;
+	int err;
+
+	err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy);
+	if (err < 0)
+		return err;
+
+	if (!tb[TCA_IFE_PARMS])
+		return -EINVAL;
+
+	parm = nla_data(tb[TCA_IFE_PARMS]);
+
+	if (parm->flags & IFE_ENCODE) {
+		/* Until we get issued the ethertype, we cant have
+		 * a default..
+		**/
+		if (!tb[TCA_IFE_TYPE]) {
+			pr_info("You MUST pass etherype for encoding\n");
+			return -EINVAL;
+		}
+	}
+
+	if (!tcf_hash_check(tn, parm->index, a, bind)) {
+		ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife),
+				      bind, false);
+		if (ret)
+			return ret;
+		ret = ACT_P_CREATED;
+	} else {
+		if (bind)	/* dont override defaults */
+			return 0;
+		tcf_hash_release(a, bind);
+		if (!ovr)
+			return -EEXIST;
+	}
+
+	ife = to_ife(a);
+	ife->flags = parm->flags;
+
+	if (parm->flags & IFE_ENCODE) {
+		ife_type = nla_get_u16(tb[TCA_IFE_TYPE]);
+		if (tb[TCA_IFE_DMAC])
+			daddr = nla_data(tb[TCA_IFE_DMAC]);
+		if (tb[TCA_IFE_SMAC])
+			saddr = nla_data(tb[TCA_IFE_SMAC]);
+	}
+
+	spin_lock_bh(&ife->tcf_lock);
+	ife->tcf_action = parm->action;
+
+	if (parm->flags & IFE_ENCODE) {
+		if (daddr)
+			ether_addr_copy(ife->eth_dst, daddr);
+		else
+			eth_zero_addr(ife->eth_dst);
+
+		if (saddr)
+			ether_addr_copy(ife->eth_src, saddr);
+		else
+			eth_zero_addr(ife->eth_src);
+
+		ife->eth_type = ife_type;
+	}
+
+	if (ret == ACT_P_CREATED)
+		INIT_LIST_HEAD(&ife->metalist);
+
+	if (tb[TCA_IFE_METALST]) {
+		err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST],
+				       NULL);
+		if (err) {
+metadata_parse_err:
+			if (ret == ACT_P_CREATED)
+				_tcf_ife_cleanup(a, bind);
+
+			spin_unlock_bh(&ife->tcf_lock);
+			return err;
+		}
+
+		err = populate_metalist(ife, tb2);
+		if (err)
+			goto metadata_parse_err;
+
+	} else {
+		/* if no passed metadata allow list or passed allow-all
+		 * then here we process by adding as many supported metadatum
+		 * as we can. You better have at least one else we are
+		 * going to bail out
+		 */
+		err = use_all_metadata(ife);
+		if (err) {
+			if (ret == ACT_P_CREATED)
+				_tcf_ife_cleanup(a, bind);
+
+			spin_unlock_bh(&ife->tcf_lock);
+			return err;
+		}
+	}
+
+	spin_unlock_bh(&ife->tcf_lock);
+
+	if (ret == ACT_P_CREATED)
+		tcf_hash_insert(tn, a);
+
+	return ret;
+}
+
+static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+			int ref)
+{
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tcf_ife_info *ife = a->priv;
+	struct tc_ife opt = {
+		.index = ife->tcf_index,
+		.refcnt = ife->tcf_refcnt - ref,
+		.bindcnt = ife->tcf_bindcnt - bind,
+		.action = ife->tcf_action,
+		.flags = ife->flags,
+	};
+	struct tcf_t t;
+
+	if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt))
+		goto nla_put_failure;
+
+	t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install);
+	t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse);
+	t.expires = jiffies_to_clock_t(ife->tcf_tm.expires);
+	if (nla_put(skb, TCA_IFE_TM, sizeof(t), &t))
+		goto nla_put_failure;
+
+	if (!is_zero_ether_addr(ife->eth_dst)) {
+		if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst))
+			goto nla_put_failure;
+	}
+
+	if (!is_zero_ether_addr(ife->eth_src)) {
+		if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src))
+			goto nla_put_failure;
+	}
+
+	if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type))
+		goto nla_put_failure;
+
+	if (dump_metalist(skb, ife)) {
+		/*ignore failure to dump metalist */
+		pr_info("Failed to dump metalist\n");
+	}
+
+	return skb->len;
+
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
+		       u16 metaid, u16 mlen, void *mdata)
+{
+	struct tcf_meta_info *e;
+
+	/* XXX: use hash to speed up */
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (metaid == e->metaid) {
+			if (e->ops) {
+				/* We check for decode presence already */
+				return e->ops->decode(skb, mdata, mlen);
+			}
+		}
+	}
+
+	return 0;
+}
+
+struct ifeheadr {
+	__be16 metalen;
+	u8 tlv_data[];
+};
+
+struct meta_tlvhdr {
+	__be16 type;
+	__be16 len;
+};
+
+static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+	int action = ife->tcf_action;
+	struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
+	u16 ifehdrln = ifehdr->metalen;
+	struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
+
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+	spin_unlock(&ife->tcf_lock);
+
+	ifehdrln = ntohs(ifehdrln);
+	if (unlikely(!pskb_may_pull(skb, ifehdrln))) {
+		spin_lock(&ife->tcf_lock);
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	skb_set_mac_header(skb, ifehdrln);
+	__skb_pull(skb, ifehdrln);
+	skb->protocol = eth_type_trans(skb, skb->dev);
+	ifehdrln -= IFE_METAHDRLEN;
+
+	while (ifehdrln > 0) {
+		u8 *tlvdata = (u8 *)tlv;
+		u16 mtype = tlv->type;
+		u16 mlen = tlv->len;
+
+		mtype = ntohs(mtype);
+		mlen = ntohs(mlen);
+
+		if (find_decode_metaid(skb, ife, mtype, (mlen - 4),
+				       (void *)(tlvdata + 4))) {
+			/* abuse overlimits to count when we receive metadata
+			 * but dont have an ops for it
+			 */
+			pr_info_ratelimited("Unknown metaid %d alnlen %d\n",
+					    mtype, mlen);
+			ife->tcf_qstats.overlimits++;
+		}
+
+		tlvdata += mlen;
+		ifehdrln -= mlen;
+		tlv = (struct meta_tlvhdr *)tlvdata;
+	}
+
+	skb_reset_network_header(skb);
+	return action;
+}
+
+/*XXX: check if we can do this at install time instead of current
+ * send data path
+**/
+static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+	struct tcf_meta_info *e, *n;
+	int tot_run_sz = 0, run_sz = 0;
+
+	list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+		if (e->ops->check_presence) {
+			run_sz = e->ops->check_presence(skb, e);
+			tot_run_sz += run_sz;
+		}
+	}
+
+	return tot_run_sz;
+}
+
+static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
+			  struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+	int action = ife->tcf_action;
+	struct ethhdr *oethh;	/* outer ether header */
+	struct ethhdr *iethh;	/* inner eth header */
+	struct tcf_meta_info *e;
+	/*
+	   OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
+	   where ORIGDATA = original ethernet header ...
+	 */
+	u16 metalen = ife_get_sz(skb, ife);
+	int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
+	unsigned int skboff = skb->dev->hard_header_len;
+	u32 at = G_TC_AT(skb->tc_verd);
+	int new_len = skb->len + hdrm;
+	bool exceed_mtu = false;
+	int err;
+
+	if (at & AT_EGRESS) {
+		if (new_len > skb->dev->mtu)
+			exceed_mtu = true;
+	}
+
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+
+	if (!metalen) {		/* no metadata to send */
+		/* abuse overlimits to count when we allow packet
+		 * with no metadata
+		 */
+		ife->tcf_qstats.overlimits++;
+		spin_unlock(&ife->tcf_lock);
+		return action;
+	}
+	/* could be stupid policy setup or mtu config
+	 * so lets be conservative.. */
+	if ((action == TC_ACT_SHOT) || exceed_mtu) {
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	iethh = eth_hdr(skb);
+
+	err = skb_cow_head(skb, hdrm);
+	if (unlikely(err)) {
+		ife->tcf_qstats.drops++;
+		spin_unlock(&ife->tcf_lock);
+		return TC_ACT_SHOT;
+	}
+
+	if (!(at & AT_EGRESS))
+		skb_push(skb, skb->dev->hard_header_len);
+
+	__skb_push(skb, hdrm);
+	memcpy(skb->data, iethh, skb->mac_len);
+	skb_reset_mac_header(skb);
+	oethh = eth_hdr(skb);
+
+	/*total metadata length */
+	metalen += IFE_METAHDRLEN;
+	metalen = htons(metalen);
+	memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
+	skboff += IFE_METAHDRLEN;
+
+	/* XXX: we dont have a clever way of telling encode to
+	 * not repeat some of the computations that are done by
+	 * ops->presence_check...
+	 */
+	list_for_each_entry(e, &ife->metalist, metalist) {
+		if (e->ops->encode) {
+			err = e->ops->encode(skb, (void *)(skb->data + skboff),
+					     e);
+		}
+		if (err < 0) {
+			/* too corrupt to keep around if overwritten */
+			ife->tcf_qstats.drops++;
+			spin_unlock(&ife->tcf_lock);
+			return TC_ACT_SHOT;
+		}
+		skboff += err;
+	}
+
+	if (!is_zero_ether_addr(ife->eth_src))
+		ether_addr_copy(oethh->h_source, ife->eth_src);
+	else
+		ether_addr_copy(oethh->h_source, iethh->h_source);
+	if (!is_zero_ether_addr(ife->eth_dst))
+		ether_addr_copy(oethh->h_dest, ife->eth_dst);
+	else
+		ether_addr_copy(oethh->h_dest, iethh->h_dest);
+	oethh->h_proto = htons(ife->eth_type);
+
+	if (!(at & AT_EGRESS))
+		skb_pull(skb, skb->dev->hard_header_len);
+
+	spin_unlock(&ife->tcf_lock);
+
+	return action;
+}
+
+static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
+		       struct tcf_result *res)
+{
+	struct tcf_ife_info *ife = a->priv;
+
+	if (ife->flags & IFE_ENCODE)
+		return tcf_ife_encode(skb, a, res);
+
+	if (!(ife->flags & IFE_ENCODE))
+		return tcf_ife_decode(skb, a, res);
+
+	pr_info_ratelimited("unknown failure(policy neither de/encode\n");
+	spin_lock(&ife->tcf_lock);
+	bstats_update(&ife->tcf_bstats, skb);
+	ife->tcf_tm.lastuse = jiffies;
+	ife->tcf_qstats.drops++;
+	spin_unlock(&ife->tcf_lock);
+
+	return TC_ACT_SHOT;
+}
+
+static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
+			  struct netlink_callback *cb, int type,
+			  struct tc_action *a)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ife_ops = {
+	.kind = "ife",
+	.type = TCA_ACT_IFE,
+	.owner = THIS_MODULE,
+	.act = tcf_ife_act,
+	.dump = tcf_ife_dump,
+	.cleanup = tcf_ife_cleanup,
+	.init = tcf_ife_init,
+	.walk = tcf_ife_walker,
+	.lookup = tcf_ife_search,
+};
+
+static __net_init int ife_init_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	return tc_action_net_init(tn, &act_ife_ops, IFE_TAB_MASK);
+}
+
+static void __net_exit ife_exit_net(struct net *net)
+{
+	struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+	tc_action_net_exit(tn);
+}
+
+static struct pernet_operations ife_net_ops = {
+	.init = ife_init_net,
+	.exit = ife_exit_net,
+	.id   = &ife_net_id,
+	.size = sizeof(struct tc_action_net),
+};
+
+static int __init ife_init_module(void)
+{
+	return tcf_register_action(&act_ife_ops, &ife_net_ops);
+}
+
+static void __exit ife_cleanup_module(void)
+{
+	tcf_unregister_action(&act_ife_ops, &ife_net_ops);
+}
+
+module_init(ife_init_module);
+module_exit(ife_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE LFB action");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 01d01d69192e417447dee97891d670804bedd2c8 Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 1 Mar 2016 17:54:37 +1100
Subject: KVM: PPC: Reserve KVM_CAP_SPAPR_TCE_64 capability number

This adds a capability number for 64-bit TCE tables support.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/uapi/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 9da905157cee..8ce5f643d078 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -850,6 +850,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
 #define KVM_CAP_HYPERV_SYNIC 123
 #define KVM_CAP_S390_RI 124
+#define KVM_CAP_SPAPR_TCE_64 125
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
cgit v1.2.3


From 58ded4201ff028b15f6b317228faa5f154a0663f Mon Sep 17 00:00:00 2001
From: Alexey Kardashevskiy <aik@ozlabs.ru>
Date: Tue, 1 Mar 2016 17:54:40 +1100
Subject: KVM: PPC: Add support for 64bit TCE windows

The existing KVM_CREATE_SPAPR_TCE only supports 32bit windows which is not
enough for directly mapped windows as the guest can get more than 4GB.

This adds KVM_CREATE_SPAPR_TCE_64 ioctl and advertises it
via KVM_CAP_SPAPR_TCE_64 capability. The table size is checked against
the locked memory limit.

Since 64bit windows are to support Dynamic DMA windows (DDW), let's add
@bus_offset and @page_shift which are also required by DDW.

Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 Documentation/virtual/kvm/api.txt   | 32 ++++++++++++++++++++++++++++++++
 arch/powerpc/include/asm/kvm_ppc.h  |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h |  9 +++++++++
 arch/powerpc/kvm/book3s_64_vio.c    | 10 +++++++---
 arch/powerpc/kvm/powerpc.c          | 25 ++++++++++++++++++++++++-
 include/uapi/linux/kvm.h            |  2 ++
 6 files changed, 75 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index da3943586a2b..bc78652b0d07 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -3060,6 +3060,38 @@ an implementation for these despite the in kernel acceleration.
 
 This capability is always enabled.
 
+4.98 KVM_CREATE_SPAPR_TCE_64
+
+Capability: KVM_CAP_SPAPR_TCE_64
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_64 (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit
+windows, described in 4.62 KVM_CREATE_SPAPR_TCE
+
+This capability uses extended struct in ioctl interface:
+
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+	__u64 liobn;
+	__u32 page_shift;
+	__u32 flags;
+	__u64 offset;	/* in pages */
+	__u64 size; 	/* in pages */
+};
+
+The aim of extension is to support an additional bigger DMA window with
+a variable page size.
+KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and
+a bus offset of the corresponding DMA window, @size and @offset are numbers
+of IOMMU pages.
+
+@flags are not used at the moment.
+
+The rest of functionality is identical to KVM_CREATE_SPAPR_TCE.
+
 5. The kvm_run structure
 ------------------------
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 197a8aca2871..2544edabe7f3 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -165,7 +165,7 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-				struct kvm_create_spapr_tce *args);
+				struct kvm_create_spapr_tce_64 *args);
 extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
 		struct kvm_vcpu *vcpu, unsigned long liobn);
 extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index ab4d4732c492..c93cf35ce379 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -333,6 +333,15 @@ struct kvm_create_spapr_tce {
 	__u32 window_size;
 };
 
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+	__u64 liobn;
+	__u32 page_shift;
+	__u32 flags;
+	__u64 offset;	/* in pages */
+	__u64 size;	/* in pages */
+};
+
 /* for KVM_ALLOCATE_RMA */
 struct kvm_allocate_rma {
 	__u64 rma_size;
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 61cbc449d0a8..2c2d1030843a 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -147,20 +147,23 @@ static const struct file_operations kvm_spapr_tce_fops = {
 };
 
 long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-				   struct kvm_create_spapr_tce *args)
+				   struct kvm_create_spapr_tce_64 *args)
 {
 	struct kvmppc_spapr_tce_table *stt = NULL;
 	unsigned long npages, size;
 	int ret = -ENOMEM;
 	int i;
 
+	if (!args->size)
+		return -EINVAL;
+
 	/* Check this LIOBN hasn't been previously allocated */
 	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
 		if (stt->liobn == args->liobn)
 			return -EBUSY;
 	}
 
-	size = args->window_size >> IOMMU_PAGE_SHIFT_4K;
+	size = args->size;
 	npages = kvmppc_tce_pages(size);
 	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
 	if (ret) {
@@ -174,7 +177,8 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 		goto fail;
 
 	stt->liobn = args->liobn;
-	stt->page_shift = IOMMU_PAGE_SHIFT_4K;
+	stt->page_shift = args->page_shift;
+	stt->offset = args->offset;
 	stt->size = size;
 	stt->kvm = kvm;
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 9258675e2ff7..19aa59b0850c 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -33,6 +33,7 @@
 #include <asm/tlbflush.h>
 #include <asm/cputhreads.h>
 #include <asm/irqflags.h>
+#include <asm/iommu.h>
 #include "timing.h"
 #include "irq.h"
 #include "../mm/mmu_decl.h"
@@ -519,6 +520,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_SPAPR_TCE:
+	case KVM_CAP_SPAPR_TCE_64:
 	case KVM_CAP_PPC_ALLOC_HTAB:
 	case KVM_CAP_PPC_RTAS:
 	case KVM_CAP_PPC_FIXUP_HCALL:
@@ -1344,13 +1346,34 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 #ifdef CONFIG_PPC_BOOK3S_64
+	case KVM_CREATE_SPAPR_TCE_64: {
+		struct kvm_create_spapr_tce_64 create_tce_64;
+
+		r = -EFAULT;
+		if (copy_from_user(&create_tce_64, argp, sizeof(create_tce_64)))
+			goto out;
+		if (create_tce_64.flags) {
+			r = -EINVAL;
+			goto out;
+		}
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
+		goto out;
+	}
 	case KVM_CREATE_SPAPR_TCE: {
 		struct kvm_create_spapr_tce create_tce;
+		struct kvm_create_spapr_tce_64 create_tce_64;
 
 		r = -EFAULT;
 		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
 			goto out;
-		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+
+		create_tce_64.liobn = create_tce.liobn;
+		create_tce_64.page_shift = IOMMU_PAGE_SHIFT_4K;
+		create_tce_64.offset = 0;
+		create_tce_64.size = create_tce.window_size >>
+				IOMMU_PAGE_SHIFT_4K;
+		create_tce_64.flags = 0;
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
 		goto out;
 	}
 	case KVM_PPC_GET_SMMU_INFO: {
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 8ce5f643d078..b06208b2669c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1143,6 +1143,8 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_PPC_ALLOC_HTAB */
 #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
 #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
+#define KVM_CREATE_SPAPR_TCE_64	  _IOW(KVMIO,  0xa8, \
+				       struct kvm_create_spapr_tce_64)
 /* Available with KVM_CAP_RMA */
 #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 /* Available with KVM_CAP_PPC_HTAB_FD */
-- 
cgit v1.2.3


From 592002f55e6e0b198a01d4a9315f74ddfea1c403 Mon Sep 17 00:00:00 2001
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 24 Feb 2016 17:07:27 +0200
Subject: virtio_blk: VIRTIO_BLK_F_WCE->VIRTIO_BLK_F_FLUSH

Latest virtio spec says the feature bit name is VIRTIO_BLK_F_FLUSH,
VIRTIO_BLK_F_WCE is the legacy name.  virtio blk header says exactly the
reverse - fix that and update driver code to match.

Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/block/virtio_blk.c      | 11 ++++++++---
 include/uapi/linux/virtio_blk.h |  6 +++---
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 6ca35495a5be..28cff0d23d82 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -477,8 +477,13 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev)
 	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
 				   struct virtio_blk_config, wce,
 				   &writeback);
+
+	/*
+	 * If WCE is not configurable and flush is not available,
+	 * assume no writeback cache is in use.
+	 */
 	if (err)
-		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE);
+		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);
 
 	return writeback;
 }
@@ -833,14 +838,14 @@ static const struct virtio_device_id id_table[] = {
 static unsigned int features_legacy[] = {
 	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
 	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
-	VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
 	VIRTIO_BLK_F_MQ,
 }
 ;
 static unsigned int features[] = {
 	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
 	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
-	VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
 	VIRTIO_BLK_F_MQ,
 };
 
diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
index 19c66fcbab8a..9ebe4d968dd5 100644
--- a/include/uapi/linux/virtio_blk.h
+++ b/include/uapi/linux/virtio_blk.h
@@ -43,11 +43,11 @@
 #ifndef VIRTIO_BLK_NO_LEGACY
 #define VIRTIO_BLK_F_BARRIER	0	/* Does host support barriers? */
 #define VIRTIO_BLK_F_SCSI	7	/* Supports scsi command passthru */
-#define VIRTIO_BLK_F_WCE	9	/* Writeback mode enabled after reset */
+#define VIRTIO_BLK_F_FLUSH	9	/* Flush command supported */
 #define VIRTIO_BLK_F_CONFIG_WCE	11	/* Writeback mode available in config */
 #ifndef __KERNEL__
-/* Old (deprecated) name for VIRTIO_BLK_F_WCE. */
-#define VIRTIO_BLK_F_FLUSH VIRTIO_BLK_F_WCE
+/* Old (deprecated) name for VIRTIO_BLK_F_FLUSH. */
+#define VIRTIO_BLK_F_WCE VIRTIO_BLK_F_FLUSH
 #endif
 #endif /* !VIRTIO_BLK_NO_LEGACY */
 
-- 
cgit v1.2.3


From 8a6bf5da1aefdafd60b73d9122c7af9fd2d7bb9c Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 1 Mar 2016 19:55:14 +0100
Subject: netfilter: nft_masq: support port range

Complete masquerading support by allowing port range selection.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_masq.h         |  4 ++-
 include/uapi/linux/netfilter/nf_tables.h |  4 +++
 net/ipv4/netfilter/nft_masq_ipv4.c       |  7 ++++-
 net/ipv6/netfilter/nft_masq_ipv6.c       |  7 ++++-
 net/netfilter/nft_masq.c                 | 51 +++++++++++++++++++++++++-------
 5 files changed, 59 insertions(+), 14 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/net/netfilter/nft_masq.h b/include/net/netfilter/nft_masq.h
index e2a518b60e19..a3f3c11b2526 100644
--- a/include/net/netfilter/nft_masq.h
+++ b/include/net/netfilter/nft_masq.h
@@ -2,7 +2,9 @@
 #define _NFT_MASQ_H_
 
 struct nft_masq {
-	u32	flags;
+	u32			flags;
+	enum nft_registers      sreg_proto_min:8;
+	enum nft_registers      sreg_proto_max:8;
 };
 
 extern const struct nla_policy nft_masq_policy[];
diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index b19be0a098c0..eeffde196f80 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -951,10 +951,14 @@ enum nft_nat_attributes {
  * enum nft_masq_attributes - nf_tables masquerade expression attributes
  *
  * @NFTA_MASQ_FLAGS: NAT flags (see NF_NAT_RANGE_* in linux/netfilter/nf_nat.h) (NLA_U32)
+ * @NFTA_MASQ_REG_PROTO_MIN: source register of proto range start (NLA_U32: nft_registers)
+ * @NFTA_MASQ_REG_PROTO_MAX: source register of proto range end (NLA_U32: nft_registers)
  */
 enum nft_masq_attributes {
 	NFTA_MASQ_UNSPEC,
 	NFTA_MASQ_FLAGS,
+	NFTA_MASQ_REG_PROTO_MIN,
+	NFTA_MASQ_REG_PROTO_MAX,
 	__NFTA_MASQ_MAX
 };
 #define NFTA_MASQ_MAX		(__NFTA_MASQ_MAX - 1)
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index b72ffc58e255..51ced81b616c 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
-
+	if (priv->sreg_proto_min) {
+		range.min_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_min];
+		range.max_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_max];
+	}
 	regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
 						    &range, pkt->out);
 }
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index cd1ac1637a05..9597ffb74077 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -26,7 +26,12 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
 
 	memset(&range, 0, sizeof(range));
 	range.flags = priv->flags;
-
+	if (priv->sreg_proto_min) {
+		range.min_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_min];
+		range.max_proto.all =
+			*(__be16 *)&regs->data[priv->sreg_proto_max];
+	}
 	regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out);
 }
 
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 9aea747b43ea..81b5ad6165ac 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -17,7 +17,9 @@
 #include <net/netfilter/nft_masq.h>
 
 const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
-	[NFTA_MASQ_FLAGS]	= { .type = NLA_U32 },
+	[NFTA_MASQ_FLAGS]		= { .type = NLA_U32 },
+	[NFTA_MASQ_REG_PROTO_MIN]	= { .type = NLA_U32 },
+	[NFTA_MASQ_REG_PROTO_MAX]	= { .type = NLA_U32 },
 };
 EXPORT_SYMBOL_GPL(nft_masq_policy);
 
@@ -40,6 +42,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
 		  const struct nft_expr *expr,
 		  const struct nlattr * const tb[])
 {
+	u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
 	struct nft_masq *priv = nft_expr_priv(expr);
 	int err;
 
@@ -47,12 +50,32 @@ int nft_masq_init(const struct nft_ctx *ctx,
 	if (err)
 		return err;
 
-	if (tb[NFTA_MASQ_FLAGS] == NULL)
-		return 0;
-
-	priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
-	if (priv->flags & ~NF_NAT_RANGE_MASK)
-		return -EINVAL;
+	if (tb[NFTA_MASQ_FLAGS]) {
+		priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
+		if (priv->flags & ~NF_NAT_RANGE_MASK)
+			return -EINVAL;
+	}
+
+	if (tb[NFTA_MASQ_REG_PROTO_MIN]) {
+		priv->sreg_proto_min =
+			nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]);
+
+		err = nft_validate_register_load(priv->sreg_proto_min, plen);
+		if (err < 0)
+			return err;
+
+		if (tb[NFTA_MASQ_REG_PROTO_MAX]) {
+			priv->sreg_proto_max =
+				nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]);
+
+			err = nft_validate_register_load(priv->sreg_proto_max,
+							 plen);
+			if (err < 0)
+				return err;
+		} else {
+			priv->sreg_proto_max = priv->sreg_proto_min;
+		}
+	}
 
 	return 0;
 }
@@ -62,12 +85,18 @@ int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
 {
 	const struct nft_masq *priv = nft_expr_priv(expr);
 
-	if (priv->flags == 0)
-		return 0;
-
-	if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
+	if (priv->flags != 0 &&
+	    nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
 		goto nla_put_failure;
 
+	if (priv->sreg_proto_min) {
+		if (nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MIN,
+				      priv->sreg_proto_min) ||
+		    nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MAX,
+				      priv->sreg_proto_max))
+			goto nla_put_failure;
+	}
+
 	return 0;
 
 nla_put_failure:
-- 
cgit v1.2.3


From 87294e9db50f34ac002e32669b2ff74b72d63584 Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Mon, 29 Feb 2016 05:02:47 -0300
Subject: [media] media.h: use hex values for IF and AUDIO entities too

Make the base offset hexadecimal to simplify debugging since the base
addresses are hex too.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 include/uapi/linux/media.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index 2e2ab13a90b0..79960aefd356 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -84,15 +84,15 @@ struct media_device_info {
  * It is a responsibility of the master/bridge drivers to create links
  * for MEDIA_ENT_F_IF_VID_DECODER and MEDIA_ENT_F_IF_AUD_DECODER.
  */
-#define MEDIA_ENT_F_IF_VID_DECODER	(MEDIA_ENT_F_BASE + 2001)
-#define MEDIA_ENT_F_IF_AUD_DECODER	(MEDIA_ENT_F_BASE + 2002)
+#define MEDIA_ENT_F_IF_VID_DECODER	(MEDIA_ENT_F_BASE + 0x02001)
+#define MEDIA_ENT_F_IF_AUD_DECODER	(MEDIA_ENT_F_BASE + 0x02002)
 
 /*
  * Audio Entity Functions
  */
-#define MEDIA_ENT_F_AUDIO_CAPTURE	(MEDIA_ENT_F_BASE + 3000)
-#define MEDIA_ENT_F_AUDIO_PLAYBACK	(MEDIA_ENT_F_BASE + 3001)
-#define MEDIA_ENT_F_AUDIO_MIXER		(MEDIA_ENT_F_BASE + 3002)
+#define MEDIA_ENT_F_AUDIO_CAPTURE	(MEDIA_ENT_F_BASE + 0x03000)
+#define MEDIA_ENT_F_AUDIO_PLAYBACK	(MEDIA_ENT_F_BASE + 0x03001)
+#define MEDIA_ENT_F_AUDIO_MIXER		(MEDIA_ENT_F_BASE + 0x03002)
 
 /*
  * Connectors
-- 
cgit v1.2.3


From 5d8d8db851ef81337e7026b32a9d5a9cfb2271d5 Mon Sep 17 00:00:00 2001
From: Aviv Greenberg <avivgr@gmail.com>
Date: Wed, 3 Feb 2016 15:08:52 -0200
Subject: [media] UVC: Add support for R200 depth camera

Add support for Intel R200 depth camera in uvc driver.
This includes adding new uvc GUIDs for the new pixel formats,
adding new V4L pixel format definition to user api headers,
and updating the uvc driver GUID-to-4cc tables with the new formats.

Tested-by: Greenberg, Aviv D <aviv.d.greenberg@intel.com>
Signed-off-by: Aviv Greenberg <aviv.d.greenberg@intel.com>
Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Signed-off-by: Guennadi Liakhovetski <g.liakhovetski@gmx.de>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 drivers/media/usb/uvc/uvc_driver.c | 20 ++++++++++++++++++++
 drivers/media/usb/uvc/uvcvideo.h   | 12 ++++++++++++
 include/uapi/linux/videodev2.h     |  3 +++
 3 files changed, 35 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/usb/uvc/uvc_driver.c b/drivers/media/usb/uvc/uvc_driver.c
index 4e7148815a78..451e84e962e2 100644
--- a/drivers/media/usb/uvc/uvc_driver.c
+++ b/drivers/media/usb/uvc/uvc_driver.c
@@ -148,6 +148,26 @@ static struct uvc_format_desc uvc_fmts[] = {
 		.guid		= UVC_GUID_FORMAT_H264,
 		.fcc		= V4L2_PIX_FMT_H264,
 	},
+	{
+		.name		= "Greyscale 8 L/R (Y8I)",
+		.guid		= UVC_GUID_FORMAT_Y8I,
+		.fcc		= V4L2_PIX_FMT_Y8I,
+	},
+	{
+		.name		= "Greyscale 12 L/R (Y12I)",
+		.guid		= UVC_GUID_FORMAT_Y12I,
+		.fcc		= V4L2_PIX_FMT_Y12I,
+	},
+	{
+		.name		= "Depth data 16-bit (Z16)",
+		.guid		= UVC_GUID_FORMAT_Z16,
+		.fcc		= V4L2_PIX_FMT_Z16,
+	},
+	{
+		.name		= "Bayer 10-bit (SRGGB10P)",
+		.guid		= UVC_GUID_FORMAT_RW10,
+		.fcc		= V4L2_PIX_FMT_SRGGB10P,
+	},
 };
 
 /* ------------------------------------------------------------------------
diff --git a/drivers/media/usb/uvc/uvcvideo.h b/drivers/media/usb/uvc/uvcvideo.h
index f0f2391e1b43..7e4d3eea371b 100644
--- a/drivers/media/usb/uvc/uvcvideo.h
+++ b/drivers/media/usb/uvc/uvcvideo.h
@@ -119,6 +119,18 @@
 #define UVC_GUID_FORMAT_H264 \
 	{ 'H',  '2',  '6',  '4', 0x00, 0x00, 0x10, 0x00, \
 	 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}
+#define UVC_GUID_FORMAT_Y8I \
+	{ 'Y',  '8',  'I',  ' ', 0x00, 0x00, 0x10, 0x00, \
+	 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}
+#define UVC_GUID_FORMAT_Y12I \
+	{ 'Y',  '1',  '2',  'I', 0x00, 0x00, 0x10, 0x00, \
+	 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}
+#define UVC_GUID_FORMAT_Z16 \
+	{ 'Z',  '1',  '6',  ' ', 0x00, 0x00, 0x10, 0x00, \
+	 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}
+#define UVC_GUID_FORMAT_RW10 \
+	{ 'R',  'W',  '1',  '0', 0x00, 0x00, 0x10, 0x00, \
+	 0x80, 0x00, 0x00, 0xaa, 0x00, 0x38, 0x9b, 0x71}
 
 /* ------------------------------------------------------------------------
  * Driver specific constants.
diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h
index 466458422385..e895975c5b0e 100644
--- a/include/uapi/linux/videodev2.h
+++ b/include/uapi/linux/videodev2.h
@@ -625,6 +625,9 @@ struct v4l2_pix_format {
 #define V4L2_PIX_FMT_JPGL	v4l2_fourcc('J', 'P', 'G', 'L') /* JPEG-Lite */
 #define V4L2_PIX_FMT_SE401      v4l2_fourcc('S', '4', '0', '1') /* se401 janggu compressed rgb */
 #define V4L2_PIX_FMT_S5C_UYVY_JPG v4l2_fourcc('S', '5', 'C', 'I') /* S5C73M3 interleaved UYVY/JPEG */
+#define V4L2_PIX_FMT_Y8I      v4l2_fourcc('Y', '8', 'I', ' ') /* Greyscale 8-bit L/R interleaved */
+#define V4L2_PIX_FMT_Y12I     v4l2_fourcc('Y', '1', '2', 'I') /* Greyscale 12-bit L/R interleaved */
+#define V4L2_PIX_FMT_Z16      v4l2_fourcc('Z', '1', '6', ' ') /* Depth data 16-bit */
 
 /* SDR formats - used only for Software Defined Radio devices */
 #define V4L2_SDR_FMT_CU8          v4l2_fourcc('C', 'U', '0', '8') /* IQ u8 */
-- 
cgit v1.2.3


From 0629e991a25a9b56a70f9f0c327aad8ae1471dbf Mon Sep 17 00:00:00 2001
From: Sakari Ailus <sakari.ailus@iki.fi>
Date: Mon, 22 Feb 2016 17:47:04 -0300
Subject: [media] media: Move media_get_uptr() macro out of the media.h user
 space header

The media_get_uptr() macro is mostly useful only for the IOCTL handling
code in media-device.c so move it there.

Signed-off-by: Sakari Ailus <sakari.ailus@linux.intel.com>
Acked-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 drivers/media/media-device.c | 5 +++++
 include/uapi/linux/media.h   | 5 -----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/media/media-device.c b/drivers/media/media-device.c
index a2353ecab687..c89dfef0c4c5 100644
--- a/drivers/media/media-device.c
+++ b/drivers/media/media-device.c
@@ -40,6 +40,11 @@
  * Userspace API
  */
 
+static inline void __user *media_get_uptr(__u64 arg)
+{
+	return (void __user *)(uintptr_t)arg;
+}
+
 static int media_device_open(struct file *filp)
 {
 	return 0;
diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index 79960aefd356..71fb8e3dce06 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -375,11 +375,6 @@ struct media_v2_topology {
 	__u64 ptr_links;
 };
 
-static inline void __user *media_get_uptr(__u64 arg)
-{
-	return (void __user *)(uintptr_t)arg;
-}
-
 /* ioctls */
 
 #define MEDIA_IOC_DEVICE_INFO		_IOWR('|', 0x00, struct media_device_info)
-- 
cgit v1.2.3


From 64bd1971decc2c2f81ca16370f8ee09bdcb1b49d Mon Sep 17 00:00:00 2001
From: Hans Verkuil <hverkuil@xs4all.nl>
Date: Thu, 3 Mar 2016 10:47:33 -0300
Subject: [media] media.h: always start with 1 for the audio entities

Start the audio defines with BASE + 0x03001 instead of 0x03000. This is consistent
with the other defines, and I think it is good practice not to start with 0, just in
case we want to do something like (id & 0xfff) in the future and treat the value 0
as a special case.

Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com>
Suggested-by: Laurent Pinchart <laurent.pinchart+renesas@ideasonboard.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@osg.samsung.com>
---
 include/uapi/linux/media.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/media.h b/include/uapi/linux/media.h
index 71fb8e3dce06..52f5834dcc41 100644
--- a/include/uapi/linux/media.h
+++ b/include/uapi/linux/media.h
@@ -90,9 +90,9 @@ struct media_device_info {
 /*
  * Audio Entity Functions
  */
-#define MEDIA_ENT_F_AUDIO_CAPTURE	(MEDIA_ENT_F_BASE + 0x03000)
-#define MEDIA_ENT_F_AUDIO_PLAYBACK	(MEDIA_ENT_F_BASE + 0x03001)
-#define MEDIA_ENT_F_AUDIO_MIXER		(MEDIA_ENT_F_BASE + 0x03002)
+#define MEDIA_ENT_F_AUDIO_CAPTURE	(MEDIA_ENT_F_BASE + 0x03001)
+#define MEDIA_ENT_F_AUDIO_PLAYBACK	(MEDIA_ENT_F_BASE + 0x03002)
+#define MEDIA_ENT_F_AUDIO_MIXER		(MEDIA_ENT_F_BASE + 0x03003)
 
 /*
  * Connectors
-- 
cgit v1.2.3


From 446fa3a95df1e8b78f25e1babc41e46edd200821 Mon Sep 17 00:00:00 2001
From: John Youn <John.Youn@synopsys.com>
Date: Fri, 5 Feb 2016 17:05:12 -0800
Subject: usb: ch9: Add size macro for SSP dev cap descriptor

The SuperspeedPlus Device Capability Descriptor has a variable size
depending on the number of sublink speed attributes.

This patch adds a macro to calculate that size. The macro takes one
argument, the Sublink Speed Attribute Count (SSAC) as reported by the
descriptor in bmAttributes[4:0].

See USB 3.1 9.6.2.5, Table 9-19.

Signed-off-by: John Youn <johnyoun@synopsys.com>
Signed-off-by: Felipe Balbi <balbi@kernel.org>
---
 include/uapi/linux/usb/ch9.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index 252ac16635dc..28bee8d44735 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -923,6 +923,12 @@ struct usb_ptm_cap_descriptor {
 	__u8  bDevCapabilityType;
 } __attribute__((packed));
 
+/*
+ * The size of the descriptor for the Sublink Speed Attribute Count
+ * (SSAC) specified in bmAttributes[4:0].
+ */
+#define USB_DT_USB_SSP_CAP_SIZE(ssac)	(16 + ssac * 4)
+
 /*-------------------------------------------------------------------------*/
 
 /* USB_DT_WIRELESS_ENDPOINT_COMP:  companion descriptor associated with
-- 
cgit v1.2.3


From 346dbc69933086312d9e76bb68ba04e2c82c9b13 Mon Sep 17 00:00:00 2001
From: Li Jun <jun.li@nxp.com>
Date: Fri, 19 Feb 2016 10:04:41 +0800
Subject: usb: add OTG status selector definition for HNP polling

A host is required to use the GetStatus command, with wIndex set to the
OTG status selector(F000H) to request the Host request flag from the
peripheral.

Acked-by: Peter Chen <peter.chen@freescale.com>
Signed-off-by: Li Jun <jun.li@nxp.com>
Signed-off-by: Felipe Balbi <balbi@kernel.org>
---
 include/uapi/linux/usb/ch9.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index 28bee8d44735..06d6c6228a7a 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -708,6 +708,7 @@ struct usb_otg20_descriptor {
 #define USB_OTG_HNP		(1 << 1)	/* swap host/device roles */
 #define USB_OTG_ADP		(1 << 2)	/* support ADP */
 
+#define OTG_STS_SELECTOR	0xF000		/* OTG status selector */
 /*-------------------------------------------------------------------------*/
 
 /* USB_DT_DEBUG:  for special highspeed devices, replacing serial console */
-- 
cgit v1.2.3


From 97be7ebe53915af504fb491fb99f064c7cf3cb09 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 3 Mar 2016 16:04:01 +0100
Subject: vfs: add the RWF_HIPRI flag for preadv2/pwritev2

This adds a flag that tells the file system that this is a high priority
request for which it's worth to poll the hardware.  The flag is purely
advisory and can be ignored if not supported.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Stephen Bates <stephen.bates@pmcs.com>
Tested-by: Stephen Bates <stephen.bates@pmcs.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/read_write.c         | 6 ++++--
 include/linux/fs.h      | 1 +
 include/uapi/linux/fs.h | 3 +++
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/read_write.c b/fs/read_write.c
index e9c9e2a667ce..07c53db04ec1 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -697,10 +697,12 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
 	struct kiocb kiocb;
 	ssize_t ret;
 
-	if (flags)
+	if (flags & ~RWF_HIPRI)
 		return -EOPNOTSUPP;
 
 	init_sync_kiocb(&kiocb, filp);
+	if (flags & RWF_HIPRI)
+		kiocb.ki_flags |= IOCB_HIPRI;
 	kiocb.ki_pos = *ppos;
 
 	ret = fn(&kiocb, iter);
@@ -715,7 +717,7 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
 {
 	ssize_t ret = 0;
 
-	if (flags)
+	if (flags & ~RWF_HIPRI)
 		return -EOPNOTSUPP;
 
 	while (iov_iter_count(iter)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6ec87964644f..337de88ff50f 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -320,6 +320,7 @@ struct writeback_control;
 #define IOCB_EVENTFD		(1 << 0)
 #define IOCB_APPEND		(1 << 1)
 #define IOCB_DIRECT		(1 << 2)
+#define IOCB_HIPRI		(1 << 3)
 
 struct kiocb {
 	struct file		*ki_filp;
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 41e0433b4a83..847c656729f8 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -305,4 +305,7 @@ struct fsxattr {
 #define SYNC_FILE_RANGE_WRITE		2
 #define SYNC_FILE_RANGE_WAIT_AFTER	4
 
+/* flags for preadv2/pwritev2: */
+#define RWF_HIPRI			0x00000001 /* high priority request, poll if possible */
+
 #endif /* _UAPI_LINUX_FS_H */
-- 
cgit v1.2.3


From b5d3755a22e0cc4c369c0985aef0c52c2477c1e7 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 4 Mar 2016 11:52:16 +0100
Subject: uapi: define DIV_ROUND_UP for userland

DIV_ROUND_UP is defined in linux/kernel.h only for the kernel.
When ethtool.h is included by a userland app, we got the following error:

include/linux/ethtool.h:1218:8: error: variably modified 'queue_mask' at file scope
  __u32 queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
        ^

Let's add a common definition in uapi and use it everywhere.

Fixes: ac2c7ad0e5d6 ("net/ethtool: introduce a new ioctl for per queue setting")
CC: Kan Liang <kan.liang@intel.com>
Suggested-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/kernel.h       | 2 +-
 include/uapi/linux/ethtool.h | 3 ++-
 include/uapi/linux/kernel.h  | 1 +
 include/uapi/linux/mroute6.h | 9 ++-------
 4 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f31638c6e873..ac1923957236 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -64,7 +64,7 @@
 #define round_down(x, y) ((x) & ~__round_mask(x, y))
 
 #define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define DIV_ROUND_UP __KERNEL_DIV_ROUND_UP
 #define DIV_ROUND_UP_ULL(ll,d) \
 	({ unsigned long long _tmp = (ll)+(d)-1; do_div(_tmp, d); _tmp; })
 
diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 37fd6dc33de4..9c22249ebf35 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -13,6 +13,7 @@
 #ifndef _UAPI_LINUX_ETHTOOL_H
 #define _UAPI_LINUX_ETHTOOL_H
 
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/if_ether.h>
 
@@ -1215,7 +1216,7 @@ enum ethtool_sfeatures_retval_bits {
 struct ethtool_per_queue_op {
 	__u32	cmd;
 	__u32	sub_command;
-	__u32	queue_mask[DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
+	__u32	queue_mask[__KERNEL_DIV_ROUND_UP(MAX_NUM_QUEUE, 32)];
 	char	data[];
 };
 
diff --git a/include/uapi/linux/kernel.h b/include/uapi/linux/kernel.h
index 321e399457f5..466073f0ce46 100644
--- a/include/uapi/linux/kernel.h
+++ b/include/uapi/linux/kernel.h
@@ -9,5 +9,6 @@
 #define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
 #define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
 
+#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
 
 #endif /* _UAPI_LINUX_KERNEL_H */
diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h
index ce91215cf7e6..5062fb5751e1 100644
--- a/include/uapi/linux/mroute6.h
+++ b/include/uapi/linux/mroute6.h
@@ -1,6 +1,7 @@
 #ifndef _UAPI__LINUX_MROUTE6_H
 #define _UAPI__LINUX_MROUTE6_H
 
+#include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/sockios.h>
 
@@ -46,14 +47,8 @@ typedef unsigned short mifi_t;
 typedef	__u32		if_mask;
 #define NIFBITS (sizeof(if_mask) * 8)        /* bits per mask */
 
-#if !defined(__KERNEL__)
-#if !defined(DIV_ROUND_UP)
-#define	DIV_ROUND_UP(x,y)	(((x) + ((y) - 1)) / (y))
-#endif
-#endif
-
 typedef struct if_set {
-	if_mask ifs_bits[DIV_ROUND_UP(IF_SETSIZE, NIFBITS)];
+	if_mask ifs_bits[__KERNEL_DIV_ROUND_UP(IF_SETSIZE, NIFBITS)];
 } if_set;
 
 #define IF_SET(n, p)    ((p)->ifs_bits[(n)/NIFBITS] |= (1 << ((n) % NIFBITS)))
-- 
cgit v1.2.3


From 14e2037902d65213842b4e40305ff54a64abbcb6 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Fri, 4 Mar 2016 11:52:19 +0100
Subject: ethtool.h: define INT_MAX for userland

INT_MAX needs limits.h in userland.
When ethtool.h is included by a userland app, we got the following error:

.../usr/include/linux/ethtool.h: In function 'ethtool_validate_speed':
.../usr/include/linux/ethtool.h:1471:18: error: 'INT_MAX' undeclared (first use in this function)
  return speed <= INT_MAX || speed == SPEED_UNKNOWN
                  ^

Fixes: e02564ee334a ("ethtool: make validate_speed accept all speeds between 0 and INT_MAX")
CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 9c22249ebf35..2835b07416b7 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -17,6 +17,10 @@
 #include <linux/types.h>
 #include <linux/if_ether.h>
 
+#ifndef __KERNEL__
+#include <limits.h> /* for INT_MAX */
+#endif
+
 /* All structures exposed to userland should be defined such that they
  * have the same layout for 32-bit and 64-bit userland.
  */
-- 
cgit v1.2.3


From d883f52e1f6d2eca8378e3795f333c1396943873 Mon Sep 17 00:00:00 2001
From: Reilly Grant <reillyg@chromium.org>
Date: Sun, 21 Feb 2016 18:38:01 -0300
Subject: usb: devio: Add ioctl to disallow detaching kernel USB drivers.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The new USBDEVFS_DROP_PRIVILEGES ioctl allows a process to voluntarily
relinquish the ability to issue other ioctls that may interfere with
other processes and drivers that have claimed an interface on the
device.

This commit also includes a simple utility to be able to test the
ioctl, located at Documentation/usb/usbdevfs-drop-permissions.c

Example (with qemu-kvm's input device):

    $ lsusb
    ...
    Bus 001 Device 002: ID 0627:0001 Adomax Technology Co., Ltd

    $ usb-devices
    ...
    C:  #Ifs= 1 Cfg#= 1 Atr=a0 MxPwr=100mA
    I:  If#= 0 Alt= 0 #EPs= 1 Cls=03(HID  ) Sub=00 Prot=02 Driver=usbhid

    $ sudo ./usbdevfs-drop-permissions /dev/bus/usb/001/002
    OK: privileges dropped!
    Available options:
    [0] Exit now
    [1] Reset device. Should fail if device is in use
    [2] Claim 4 interfaces. Should succeed where not in use
    [3] Narrow interface permission mask
    Which option shall I run?: 1
    ERROR: USBDEVFS_RESET failed! (1 - Operation not permitted)
    Which test shall I run next?: 2
    ERROR claiming if 0 (1 - Operation not permitted)
    ERROR claiming if 1 (1 - Operation not permitted)
    ERROR claiming if 2 (1 - Operation not permitted)
    ERROR claiming if 3 (1 - Operation not permitted)
    Which test shall I run next?: 0

After unbinding usbhid:

    $ usb-devices
    ...
    I:  If#= 0 Alt= 0 #EPs= 1 Cls=03(HID  ) Sub=00 Prot=02 Driver=(none)

    $ sudo ./usbdevfs-drop-permissions /dev/bus/usb/001/002
    ...
    Which option shall I run?: 2
    OK: claimed if 0
    ERROR claiming if 1 (1 - Operation not permitted)
    ERROR claiming if 2 (1 - Operation not permitted)
    ERROR claiming if 3 (1 - Operation not permitted)
    Which test shall I run next?: 1
    OK: USBDEVFS_RESET succeeded
    Which test shall I run next?: 0

After unbinding usbhid and restricting the mask:

    $ sudo ./usbdevfs-drop-permissions /dev/bus/usb/001/002
    ...
    Which option shall I run?: 3
    Insert new mask: 0
    OK: privileges dropped!
    Which test shall I run next?: 2
    ERROR claiming if 0 (1 - Operation not permitted)
    ERROR claiming if 1 (1 - Operation not permitted)
    ERROR claiming if 2 (1 - Operation not permitted)
    ERROR claiming if 3 (1 - Operation not permitted)

Signed-off-by: Reilly Grant <reillyg@chromium.org>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Emilio López <emilio.lopez@collabora.co.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/DocBook/usb.tmpl                |  12 +++
 Documentation/usb/usbdevfs-drop-permissions.c | 120 ++++++++++++++++++++++++++
 drivers/usb/core/devio.c                      |  63 ++++++++++++--
 include/uapi/linux/usbdevice_fs.h             |   2 +
 4 files changed, 192 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/usb/usbdevfs-drop-permissions.c

(limited to 'include/uapi/linux')

diff --git a/Documentation/DocBook/usb.tmpl b/Documentation/DocBook/usb.tmpl
index 4cd5b2cd0f3d..bc776be0f19c 100644
--- a/Documentation/DocBook/usb.tmpl
+++ b/Documentation/DocBook/usb.tmpl
@@ -732,6 +732,18 @@ usbdev_ioctl (int fd, int ifno, unsigned request, void *param)
 		    or SET_INTERFACE.
 		    </para></warning></listitem></varlistentry>
 
+		<varlistentry><term>USBDEVFS_DROP_PRIVILEGES</term>
+		    <listitem><para>This is used to relinquish the ability
+		    to do certain operations which are considered to be
+		    privileged on a usbfs file descriptor.
+		    This includes claiming arbitrary interfaces, resetting
+		    a device on which there are currently claimed interfaces
+		    from other users, and issuing USBDEVFS_IOCTL calls.
+		    The ioctl parameter is a 32 bit mask of interfaces
+		    the user is allowed to claim on this file descriptor.
+		    You may issue this ioctl more than one time to narrow
+		    said mask.
+		    </para></listitem></varlistentry>
 		</variablelist>
 
 		</sect2>
diff --git a/Documentation/usb/usbdevfs-drop-permissions.c b/Documentation/usb/usbdevfs-drop-permissions.c
new file mode 100644
index 000000000000..6b8da6ef0c9a
--- /dev/null
+++ b/Documentation/usb/usbdevfs-drop-permissions.c
@@ -0,0 +1,120 @@
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <inttypes.h>
+#include <unistd.h>
+
+#include <linux/usbdevice_fs.h>
+
+/* For building without an updated set of headers */
+#ifndef USBDEVFS_DROP_PRIVILEGES
+#define USBDEVFS_DROP_PRIVILEGES		_IOW('U', 30, __u32)
+#define USBDEVFS_CAP_DROP_PRIVILEGES		0x40
+#endif
+
+void drop_privileges(int fd, uint32_t mask)
+{
+	int res;
+
+	res = ioctl(fd, USBDEVFS_DROP_PRIVILEGES, &mask);
+	if (res)
+		printf("ERROR: USBDEVFS_DROP_PRIVILEGES returned %d\n", res);
+	else
+		printf("OK: privileges dropped!\n");
+}
+
+void reset_device(int fd)
+{
+	int res;
+
+	res = ioctl(fd, USBDEVFS_RESET);
+	if (!res)
+		printf("OK: USBDEVFS_RESET succeeded\n");
+	else
+		printf("ERROR: reset failed! (%d - %s)\n",
+		       -res, strerror(-res));
+}
+
+void claim_some_intf(int fd)
+{
+	int i, res;
+
+	for (i = 0; i < 4; i++) {
+		res = ioctl(fd, USBDEVFS_CLAIMINTERFACE, &i);
+		if (!res)
+			printf("OK: claimed if %d\n", i);
+		else
+			printf("ERROR claiming if %d (%d - %s)\n",
+			       i, -res, strerror(-res));
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	uint32_t mask, caps;
+	int c, fd;
+
+	fd = open(argv[1], O_RDWR);
+	if (fd < 0) {
+		printf("Failed to open file\n");
+		goto err_fd;
+	}
+
+	/*
+	 * check if dropping privileges is supported,
+	 * bail on systems where the capability is not present
+	 */
+	ioctl(fd, USBDEVFS_GET_CAPABILITIES, &caps);
+	if (!(caps & USBDEVFS_CAP_DROP_PRIVILEGES)) {
+		printf("DROP_PRIVILEGES not supported\n");
+		goto err;
+	}
+
+	/*
+	 * Drop privileges but keep the ability to claim all
+	 * free interfaces (i.e., those not used by kernel drivers)
+	 */
+	drop_privileges(fd, -1U);
+
+	printf("Available options:\n"
+		"[0] Exit now\n"
+		"[1] Reset device. Should fail if device is in use\n"
+		"[2] Claim 4 interfaces. Should succeed where not in use\n"
+		"[3] Narrow interface permission mask\n"
+		"Which option shall I run?: ");
+
+	while (scanf("%d", &c) == 1) {
+		switch (c) {
+		case 0:
+			goto exit;
+		case 1:
+			reset_device(fd);
+			break;
+		case 2:
+			claim_some_intf(fd);
+			break;
+		case 3:
+			printf("Insert new mask: ");
+			scanf("%x", &mask);
+			drop_privileges(fd, mask);
+			break;
+		default:
+			printf("I don't recognize that\n");
+		}
+
+		printf("Which test shall I run next?: ");
+	}
+
+exit:
+	close(fd);
+	return 0;
+
+err:
+	close(fd);
+err_fd:
+	return 1;
+}
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index 39da1662b76f..52c4461dfccd 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -79,6 +79,8 @@ struct usb_dev_state {
 	unsigned long ifclaimed;
 	u32 secid;
 	u32 disabled_bulk_eps;
+	bool privileges_dropped;
+	unsigned long interface_allowed_mask;
 };
 
 struct usb_memory {
@@ -748,6 +750,10 @@ static int claimintf(struct usb_dev_state *ps, unsigned int ifnum)
 	if (test_bit(ifnum, &ps->ifclaimed))
 		return 0;
 
+	if (ps->privileges_dropped &&
+			!test_bit(ifnum, &ps->interface_allowed_mask))
+		return -EACCES;
+
 	intf = usb_ifnum_to_if(dev, ifnum);
 	if (!intf)
 		err = -ENOENT;
@@ -985,7 +991,7 @@ static int usbdev_open(struct inode *inode, struct file *file)
 	int ret;
 
 	ret = -ENOMEM;
-	ps = kmalloc(sizeof(struct usb_dev_state), GFP_KERNEL);
+	ps = kzalloc(sizeof(struct usb_dev_state), GFP_KERNEL);
 	if (!ps)
 		goto out_free_ps;
 
@@ -1013,17 +1019,15 @@ static int usbdev_open(struct inode *inode, struct file *file)
 
 	ps->dev = dev;
 	ps->file = file;
+	ps->interface_allowed_mask = 0xFFFFFFFF; /* 32 bits */
 	spin_lock_init(&ps->lock);
 	INIT_LIST_HEAD(&ps->list);
 	INIT_LIST_HEAD(&ps->async_pending);
 	INIT_LIST_HEAD(&ps->async_completed);
 	INIT_LIST_HEAD(&ps->memory_list);
 	init_waitqueue_head(&ps->wait);
-	ps->discsignr = 0;
 	ps->disc_pid = get_pid(task_pid(current));
 	ps->cred = get_current_cred();
-	ps->disccontext = NULL;
-	ps->ifclaimed = 0;
 	security_task_getsecid(current, &ps->secid);
 	smp_wmb();
 	list_add_tail(&ps->list, &dev->filelist);
@@ -1324,6 +1328,28 @@ static int proc_connectinfo(struct usb_dev_state *ps, void __user *arg)
 
 static int proc_resetdevice(struct usb_dev_state *ps)
 {
+	struct usb_host_config *actconfig = ps->dev->actconfig;
+	struct usb_interface *interface;
+	int i, number;
+
+	/* Don't allow a device reset if the process has dropped the
+	 * privilege to do such things and any of the interfaces are
+	 * currently claimed.
+	 */
+	if (ps->privileges_dropped && actconfig) {
+		for (i = 0; i < actconfig->desc.bNumInterfaces; ++i) {
+			interface = actconfig->interface[i];
+			number = interface->cur_altsetting->desc.bInterfaceNumber;
+			if (usb_interface_claimed(interface) &&
+					!test_bit(number, &ps->ifclaimed)) {
+				dev_warn(&ps->dev->dev,
+					"usbfs: interface %d claimed by %s while '%s' resets device\n",
+					number,	interface->dev.driver->name, current->comm);
+				return -EACCES;
+			}
+		}
+	}
+
 	return usb_reset_device(ps->dev);
 }
 
@@ -2090,6 +2116,9 @@ static int proc_ioctl(struct usb_dev_state *ps, struct usbdevfs_ioctl *ctl)
 	struct usb_interface    *intf = NULL;
 	struct usb_driver       *driver = NULL;
 
+	if (ps->privileges_dropped)
+		return -EACCES;
+
 	/* alloc buffer */
 	size = _IOC_SIZE(ctl->ioctl_code);
 	if (size > 0) {
@@ -2215,7 +2244,8 @@ static int proc_get_capabilities(struct usb_dev_state *ps, void __user *arg)
 	__u32 caps;
 
 	caps = USBDEVFS_CAP_ZERO_PACKET | USBDEVFS_CAP_NO_PACKET_SIZE_LIM |
-			USBDEVFS_CAP_REAP_AFTER_DISCONNECT | USBDEVFS_CAP_MMAP;
+			USBDEVFS_CAP_REAP_AFTER_DISCONNECT | USBDEVFS_CAP_MMAP |
+			USBDEVFS_CAP_DROP_PRIVILEGES;
 	if (!ps->dev->bus->no_stop_on_short)
 		caps |= USBDEVFS_CAP_BULK_CONTINUATION;
 	if (ps->dev->bus->sg_tablesize)
@@ -2242,6 +2272,9 @@ static int proc_disconnect_claim(struct usb_dev_state *ps, void __user *arg)
 	if (intf->dev.driver) {
 		struct usb_driver *driver = to_usb_driver(intf->dev.driver);
 
+		if (ps->privileges_dropped)
+			return -EACCES;
+
 		if ((dc.flags & USBDEVFS_DISCONNECT_CLAIM_IF_DRIVER) &&
 				strncmp(dc.driver, intf->dev.driver->name,
 					sizeof(dc.driver)) != 0)
@@ -2298,6 +2331,23 @@ static int proc_free_streams(struct usb_dev_state *ps, void __user *arg)
 	return r;
 }
 
+static int proc_drop_privileges(struct usb_dev_state *ps, void __user *arg)
+{
+	u32 data;
+
+	if (copy_from_user(&data, arg, sizeof(data)))
+		return -EFAULT;
+
+	/* This is an one way operation. Once privileges are
+	 * dropped, you cannot regain them. You may however reissue
+	 * this ioctl to shrink the allowed interfaces mask.
+	 */
+	ps->interface_allowed_mask &= data;
+	ps->privileges_dropped = true;
+
+	return 0;
+}
+
 /*
  * NOTE:  All requests here that have interface numbers as parameters
  * are assuming that somehow the configuration has been prevented from
@@ -2486,6 +2536,9 @@ static long usbdev_do_ioctl(struct file *file, unsigned int cmd,
 	case USBDEVFS_FREE_STREAMS:
 		ret = proc_free_streams(ps, p);
 		break;
+	case USBDEVFS_DROP_PRIVILEGES:
+		ret = proc_drop_privileges(ps, p);
+		break;
 	}
 
  done:
diff --git a/include/uapi/linux/usbdevice_fs.h b/include/uapi/linux/usbdevice_fs.h
index ecbd17650e6c..a8653a6f40df 100644
--- a/include/uapi/linux/usbdevice_fs.h
+++ b/include/uapi/linux/usbdevice_fs.h
@@ -135,6 +135,7 @@ struct usbdevfs_hub_portinfo {
 #define USBDEVFS_CAP_BULK_SCATTER_GATHER	0x08
 #define USBDEVFS_CAP_REAP_AFTER_DISCONNECT	0x10
 #define USBDEVFS_CAP_MMAP			0x20
+#define USBDEVFS_CAP_DROP_PRIVILEGES		0x40
 
 /* USBDEVFS_DISCONNECT_CLAIM flags & struct */
 
@@ -188,5 +189,6 @@ struct usbdevfs_streams {
 #define USBDEVFS_DISCONNECT_CLAIM  _IOR('U', 27, struct usbdevfs_disconnect_claim)
 #define USBDEVFS_ALLOC_STREAMS     _IOR('U', 28, struct usbdevfs_streams)
 #define USBDEVFS_FREE_STREAMS      _IOR('U', 29, struct usbdevfs_streams)
+#define USBDEVFS_DROP_PRIVILEGES   _IOW('U', 30, __u32)
 
 #endif /* _UAPI_LINUX_USBDEVICE_FS_H */
-- 
cgit v1.2.3


From d4f323672aa63713b7ca26da418f66cc30d3a41a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Thu, 3 Mar 2016 16:08:54 -0800
Subject: nfit, libnvdimm: clear poison command support

Add the boiler-plate for a 'clear error' command based on section
9.20.7.6 "Function Index 4 - Clear Uncorrectable Error" from the ACPI
6.1 specification, and add a reference implementation in nfit_test.

Reviewed-by: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/nfit.c              | 12 +++++++++++-
 drivers/nvdimm/bus.c             | 19 +++++++++++++++++++
 include/uapi/linux/ndctl.h       | 13 +++++++++++++
 tools/testing/nvdimm/test/nfit.c | 29 +++++++++++++++++++++++++++++
 4 files changed, 72 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/acpi/nfit.c b/drivers/acpi/nfit.c
index 0def4ebf5d43..c067d7414007 100644
--- a/drivers/acpi/nfit.c
+++ b/drivers/acpi/nfit.c
@@ -87,6 +87,7 @@ static struct acpi_device *to_acpi_dev(struct acpi_nfit_desc *acpi_desc)
 
 static int xlat_status(void *buf, unsigned int cmd)
 {
+	struct nd_cmd_clear_error *clear_err;
 	struct nd_cmd_ars_status *ars_status;
 	struct nd_cmd_ars_start *ars_start;
 	struct nd_cmd_ars_cap *ars_cap;
@@ -149,6 +150,15 @@ static int xlat_status(void *buf, unsigned int cmd)
 		if (ars_status->status >> 16)
 			return -EIO;
 		break;
+	case ND_CMD_CLEAR_ERROR:
+		clear_err = buf;
+		if (clear_err->status & 0xffff)
+			return -EIO;
+		if (!clear_err->cleared)
+			return -EIO;
+		if (clear_err->length > clear_err->cleared)
+			return clear_err->cleared;
+		break;
 	default:
 		break;
 	}
@@ -1002,7 +1012,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
 	if (!adev)
 		return;
 
-	for (i = ND_CMD_ARS_CAP; i <= ND_CMD_ARS_STATUS; i++)
+	for (i = ND_CMD_ARS_CAP; i <= ND_CMD_CLEAR_ERROR; i++)
 		if (acpi_check_dsm(adev->handle, uuid, 1, 1ULL << i))
 			set_bit(i, &nd_desc->dsm_mask);
 }
diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 2e9ac22595ec..cb6fd64b13e3 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -421,6 +421,12 @@ static const struct nd_cmd_desc __nd_cmd_bus_descs[] = {
 		.out_num = 3,
 		.out_sizes = { 4, 4, UINT_MAX, },
 	},
+	[ND_CMD_CLEAR_ERROR] = {
+		.in_num = 2,
+		.in_sizes = { 8, 8, },
+		.out_num = 3,
+		.out_sizes = { 4, 4, 8, },
+	},
 };
 
 const struct nd_cmd_desc *nd_cmd_bus_desc(int cmd)
@@ -489,6 +495,13 @@ void wait_nvdimm_bus_probe_idle(struct device *dev)
 	} while (true);
 }
 
+static int pmem_active(struct device *dev, void *data)
+{
+	if (is_nd_pmem(dev) && dev->driver)
+		return -EBUSY;
+	return 0;
+}
+
 /* set_config requires an idle interleave set */
 static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus,
 		struct nvdimm *nvdimm, unsigned int cmd)
@@ -503,6 +516,11 @@ static int nd_cmd_clear_to_send(struct nvdimm_bus *nvdimm_bus,
 			return rc;
 	}
 
+	/* require clear error to go through the pmem driver */
+	if (!nvdimm && cmd == ND_CMD_CLEAR_ERROR)
+		return device_for_each_child(&nvdimm_bus->dev, NULL,
+				pmem_active);
+
 	if (!nvdimm || cmd != ND_CMD_SET_CONFIG_DATA)
 		return 0;
 
@@ -551,6 +569,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		case ND_CMD_VENDOR:
 		case ND_CMD_SET_CONFIG_DATA:
 		case ND_CMD_ARS_START:
+		case ND_CMD_CLEAR_ERROR:
 			dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n",
 					nvdimm ? nvdimm_cmd_name(cmd)
 					: nvdimm_bus_cmd_name(cmd));
diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
index cc68b92124d4..7cc28ab05b87 100644
--- a/include/uapi/linux/ndctl.h
+++ b/include/uapi/linux/ndctl.h
@@ -98,6 +98,14 @@ struct nd_cmd_ars_status {
 	} __packed records[0];
 } __packed;
 
+struct nd_cmd_clear_error {
+	__u64 address;
+	__u64 length;
+	__u32 status;
+	__u8 reserved[4];
+	__u64 cleared;
+} __packed;
+
 enum {
 	ND_CMD_IMPLEMENTED = 0,
 
@@ -105,6 +113,7 @@ enum {
 	ND_CMD_ARS_CAP = 1,
 	ND_CMD_ARS_START = 2,
 	ND_CMD_ARS_STATUS = 3,
+	ND_CMD_CLEAR_ERROR = 4,
 
 	/* per-dimm commands */
 	ND_CMD_SMART = 1,
@@ -129,6 +138,7 @@ static inline const char *nvdimm_bus_cmd_name(unsigned cmd)
 		[ND_CMD_ARS_CAP] = "ars_cap",
 		[ND_CMD_ARS_START] = "ars_start",
 		[ND_CMD_ARS_STATUS] = "ars_status",
+		[ND_CMD_CLEAR_ERROR] = "clear_error",
 	};
 
 	if (cmd < ARRAY_SIZE(names) && names[cmd])
@@ -187,6 +197,9 @@ static inline const char *nvdimm_cmd_name(unsigned cmd)
 #define ND_IOCTL_ARS_STATUS		_IOWR(ND_IOCTL, ND_CMD_ARS_STATUS,\
 					struct nd_cmd_ars_status)
 
+#define ND_IOCTL_CLEAR_ERROR		_IOWR(ND_IOCTL, ND_CMD_CLEAR_ERROR,\
+					struct nd_cmd_clear_error)
+
 #define ND_DEVICE_DIMM 1            /* nd_dimm: container for "config data" */
 #define ND_DEVICE_REGION_PMEM 2     /* nd_region: (parent of PMEM namespaces) */
 #define ND_DEVICE_REGION_BLK 3      /* nd_region: (parent of BLK namespaces) */
diff --git a/tools/testing/nvdimm/test/nfit.c b/tools/testing/nvdimm/test/nfit.c
index 1555c09efba1..3187322eeed7 100644
--- a/tools/testing/nvdimm/test/nfit.c
+++ b/tools/testing/nvdimm/test/nfit.c
@@ -223,6 +223,7 @@ static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd,
 }
 
 #define NFIT_TEST_ARS_RECORDS 4
+#define NFIT_TEST_CLEAR_ERR_UNIT 256
 
 static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
 		unsigned int buf_len)
@@ -233,6 +234,7 @@ static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
 	nd_cmd->max_ars_out = sizeof(struct nd_cmd_ars_status)
 		+ NFIT_TEST_ARS_RECORDS * sizeof(struct nd_ars_record);
 	nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16;
+	nd_cmd->clear_err_unit = NFIT_TEST_CLEAR_ERR_UNIT;
 
 	return 0;
 }
@@ -306,6 +308,28 @@ static int nfit_test_cmd_ars_status(struct ars_state *ars_state,
 	return 0;
 }
 
+static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err,
+		unsigned int buf_len, int *cmd_rc)
+{
+	const u64 mask = NFIT_TEST_CLEAR_ERR_UNIT - 1;
+	if (buf_len < sizeof(*clear_err))
+		return -EINVAL;
+
+	if ((clear_err->address & mask) || (clear_err->length & mask))
+		return -EINVAL;
+
+	/*
+	 * Report 'all clear' success for all commands even though a new
+	 * scrub will find errors again.  This is enough to have the
+	 * error removed from the 'badblocks' tracking in the pmem
+	 * driver.
+	 */
+	clear_err->status = 0;
+	clear_err->cleared = clear_err->length;
+	*cmd_rc = 0;
+	return 0;
+}
+
 static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 		struct nvdimm *nvdimm, unsigned int cmd, void *buf,
 		unsigned int buf_len, int *cmd_rc)
@@ -365,6 +389,9 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
 			rc = nfit_test_cmd_ars_status(ars_state, buf, buf_len,
 					cmd_rc);
 			break;
+		case ND_CMD_CLEAR_ERROR:
+			rc = nfit_test_cmd_clear_error(buf, buf_len, cmd_rc);
+			break;
 		default:
 			return -ENOTTY;
 		}
@@ -1230,6 +1257,7 @@ static void nfit_test0_setup(struct nfit_test *t)
 	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
+	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_dsm_force_en);
 }
 
 static void nfit_test1_setup(struct nfit_test *t)
@@ -1290,6 +1318,7 @@ static void nfit_test1_setup(struct nfit_test *t)
 	set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_START, &acpi_desc->bus_dsm_force_en);
 	set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_dsm_force_en);
+	set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_dsm_force_en);
 }
 
 static int nfit_test_blk_do_io(struct nd_blk_region *ndbr, resource_size_t dpa,
-- 
cgit v1.2.3


From 30530791a7a032dc27dbbab56b8afabd5138074c Mon Sep 17 00:00:00 2001
From: Wilson Ding <dingwei@marvell.com>
Date: Tue, 16 Feb 2016 19:14:53 +0100
Subject: serial: mvebu-uart: initial support for Armada-3700 serial port

Armada-3700's uart is a simple serial port, which doesn't
support. Configuring the modem control lines. The uart port has a 32
bytes Tx FIFO and a 64 bytes Rx FIFO

The uart driver implements the uart core operations. It also support the
system (early) console based on Armada-3700's serial port.

Known Issue:

The uart driver currently doesn't support clock programming, which means
the baud-rate stays with the default value configured by the bootloader
at boot time

[gregory.clement@free-electrons.com: Rewrite many part which are too long
to enumerate]

Signed-off-by: Wilson Ding <dingwei@marvell.com>
Signed-off-by: Nadav Haklai <nadavh@marvell.com>
Signed-off-by: Gregory CLEMENT <gregory.clement@free-electrons.com>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 .../devicetree/bindings/tty/serial/mvebu-uart.txt  |  13 +
 Documentation/kernel-parameters.txt                |   6 +
 drivers/tty/serial/Kconfig                         |  22 +
 drivers/tty/serial/Makefile                        |   1 +
 drivers/tty/serial/mvebu-uart.c                    | 650 +++++++++++++++++++++
 include/uapi/linux/serial_core.h                   |   3 +
 6 files changed, 695 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/tty/serial/mvebu-uart.txt
 create mode 100644 drivers/tty/serial/mvebu-uart.c

(limited to 'include/uapi/linux')

diff --git a/Documentation/devicetree/bindings/tty/serial/mvebu-uart.txt b/Documentation/devicetree/bindings/tty/serial/mvebu-uart.txt
new file mode 100644
index 000000000000..6087defd9f93
--- /dev/null
+++ b/Documentation/devicetree/bindings/tty/serial/mvebu-uart.txt
@@ -0,0 +1,13 @@
+* Marvell UART : Non standard UART used in some of Marvell EBU SoCs (e.g., Armada-3700)
+
+Required properties:
+- compatible: "marvell,armada-3700-uart"
+- reg: offset and length of the register set for the device.
+- interrupts: device interrupt
+
+Example:
+	serial@12000 {
+		compatible = "marvell,armada-3700-uart";
+		reg = <0x12000 0x400>;
+		interrupts = <43>;
+	};
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9a53c929f017..1ffa9dba13fd 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1058,6 +1058,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			A valid base address must be provided, and the serial
 			port must already be setup and configured.
 
+		armada3700_uart,<addr>
+			Start an early, polled-mode console on the
+			Armada 3700 serial port at the specified
+			address. The serial port must already be setup
+			and configured. Options are not yet supported.
+
 	earlyprintk=	[X86,SH,BLACKFIN,ARM,M68k]
 			earlyprintk=vga
 			earlyprintk=efi
diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig
index de2481cf6e58..13d4ed6caac4 100644
--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -1602,6 +1602,28 @@ config SERIAL_STM32_CONSOLE
 	depends on SERIAL_STM32=y
 	select SERIAL_CORE_CONSOLE
 
+config SERIAL_MVEBU_UART
+	bool "Marvell EBU serial port support"
+	select SERIAL_CORE
+	help
+	  This driver is for Marvell EBU SoC's UART. If you have a machine
+	  based on the Armada-3700 SoC and wish to use the on-board serial
+	  port,
+	  say 'Y' here.
+	  Otherwise, say 'N'.
+
+config SERIAL_MVEBU_CONSOLE
+	bool "Console on Marvell EBU serial port"
+	depends on SERIAL_MVEBU_UART
+	select SERIAL_CORE_CONSOLE
+	select SERIAL_EARLYCON
+	default y
+	help
+	  Say 'Y' here if you wish to use Armada-3700 UART as the system console.
+	  (the system console is the device which receives all kernel messages
+	  and warnings and which allows logins in single user mode)
+	  Otherwise, say 'N'.
+
 endmenu
 
 config SERIAL_MCTRL_GPIO
diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile
index ceba33c4ebb4..8c261adac04e 100644
--- a/drivers/tty/serial/Makefile
+++ b/drivers/tty/serial/Makefile
@@ -90,6 +90,7 @@ obj-$(CONFIG_SERIAL_CONEXANT_DIGICOLOR)	+= digicolor-usart.o
 obj-$(CONFIG_SERIAL_MEN_Z135)	+= men_z135_uart.o
 obj-$(CONFIG_SERIAL_SPRD) += sprd_serial.o
 obj-$(CONFIG_SERIAL_STM32)	+= stm32-usart.o
+obj-$(CONFIG_SERIAL_MVEBU_UART)	+= mvebu-uart.o
 
 # GPIOLIB helpers for modem control lines
 obj-$(CONFIG_SERIAL_MCTRL_GPIO)	+= serial_mctrl_gpio.o
diff --git a/drivers/tty/serial/mvebu-uart.c b/drivers/tty/serial/mvebu-uart.c
new file mode 100644
index 000000000000..0ff27818bb87
--- /dev/null
+++ b/drivers/tty/serial/mvebu-uart.c
@@ -0,0 +1,650 @@
+/*
+* ***************************************************************************
+* Copyright (C) 2015 Marvell International Ltd.
+* ***************************************************************************
+* This program is free software: you can redistribute it and/or modify it
+* under the terms of the GNU General Public License as published by the Free
+* Software Foundation, either version 2 of the License, or any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program.  If not, see <http://www.gnu.org/licenses/>.
+* ***************************************************************************
+*/
+
+#include <linux/clk.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/serial.h>
+#include <linux/serial_core.h>
+#include <linux/slab.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+/* Register Map */
+#define UART_RBR		0x00
+#define  RBR_BRK_DET		BIT(15)
+#define  RBR_FRM_ERR_DET	BIT(14)
+#define  RBR_PAR_ERR_DET	BIT(13)
+#define  RBR_OVR_ERR_DET	BIT(12)
+
+#define UART_TSH		0x04
+
+#define UART_CTRL		0x08
+#define  CTRL_SOFT_RST		BIT(31)
+#define  CTRL_TXFIFO_RST	BIT(15)
+#define  CTRL_RXFIFO_RST	BIT(14)
+#define  CTRL_ST_MIRR_EN	BIT(13)
+#define  CTRL_LPBK_EN		BIT(12)
+#define  CTRL_SND_BRK_SEQ	BIT(11)
+#define  CTRL_PAR_EN		BIT(10)
+#define  CTRL_TWO_STOP		BIT(9)
+#define  CTRL_TX_HFL_INT	BIT(8)
+#define  CTRL_RX_HFL_INT	BIT(7)
+#define  CTRL_TX_EMP_INT	BIT(6)
+#define  CTRL_TX_RDY_INT	BIT(5)
+#define  CTRL_RX_RDY_INT	BIT(4)
+#define  CTRL_BRK_DET_INT	BIT(3)
+#define  CTRL_FRM_ERR_INT	BIT(2)
+#define  CTRL_PAR_ERR_INT	BIT(1)
+#define  CTRL_OVR_ERR_INT	BIT(0)
+#define  CTRL_RX_INT			(CTRL_RX_RDY_INT | CTRL_BRK_DET_INT |\
+	CTRL_FRM_ERR_INT | CTRL_PAR_ERR_INT | CTRL_OVR_ERR_INT)
+
+#define UART_STAT		0x0c
+#define  STAT_TX_FIFO_EMP	BIT(13)
+#define  STAT_RX_FIFO_EMP	BIT(12)
+#define  STAT_TX_FIFO_FUL	BIT(11)
+#define  STAT_TX_FIFO_HFL	BIT(10)
+#define  STAT_RX_TOGL		BIT(9)
+#define  STAT_RX_FIFO_FUL	BIT(8)
+#define  STAT_RX_FIFO_HFL	BIT(7)
+#define  STAT_TX_EMP		BIT(6)
+#define  STAT_TX_RDY		BIT(5)
+#define  STAT_RX_RDY		BIT(4)
+#define  STAT_BRK_DET		BIT(3)
+#define  STAT_FRM_ERR		BIT(2)
+#define  STAT_PAR_ERR		BIT(1)
+#define  STAT_OVR_ERR		BIT(0)
+#define  STAT_BRK_ERR		(STAT_BRK_DET | STAT_FRM_ERR | STAT_FRM_ERR\
+				 | STAT_PAR_ERR | STAT_OVR_ERR)
+
+#define UART_BRDV		0x10
+
+#define MVEBU_NR_UARTS		1
+
+#define MVEBU_UART_TYPE		"mvebu-uart"
+
+static struct uart_port mvebu_uart_ports[MVEBU_NR_UARTS];
+
+struct mvebu_uart_data {
+	struct uart_port *port;
+	struct clk       *clk;
+};
+
+/* Core UART Driver Operations */
+static unsigned int mvebu_uart_tx_empty(struct uart_port *port)
+{
+	unsigned long flags;
+	unsigned int st;
+
+	spin_lock_irqsave(&port->lock, flags);
+	st = readl(port->membase + UART_STAT);
+	spin_unlock_irqrestore(&port->lock, flags);
+
+	return (st & STAT_TX_FIFO_EMP) ? TIOCSER_TEMT : 0;
+}
+
+static unsigned int mvebu_uart_get_mctrl(struct uart_port *port)
+{
+	return TIOCM_CTS | TIOCM_DSR | TIOCM_CAR;
+}
+
+static void mvebu_uart_set_mctrl(struct uart_port *port,
+				 unsigned int mctrl)
+{
+/*
+ * Even if we do not support configuring the modem control lines, this
+ * function must be proided to the serial core
+ */
+}
+
+static void mvebu_uart_stop_tx(struct uart_port *port)
+{
+	unsigned int ctl = readl(port->membase + UART_CTRL);
+
+	ctl &= ~CTRL_TX_RDY_INT;
+	writel(ctl, port->membase + UART_CTRL);
+}
+
+static void mvebu_uart_start_tx(struct uart_port *port)
+{
+	unsigned int ctl = readl(port->membase + UART_CTRL);
+
+	ctl |= CTRL_TX_RDY_INT;
+	writel(ctl, port->membase + UART_CTRL);
+}
+
+static void mvebu_uart_stop_rx(struct uart_port *port)
+{
+	unsigned int ctl = readl(port->membase + UART_CTRL);
+
+	ctl &= ~CTRL_RX_INT;
+	writel(ctl, port->membase + UART_CTRL);
+}
+
+static void mvebu_uart_break_ctl(struct uart_port *port, int brk)
+{
+	unsigned int ctl;
+	unsigned long flags;
+
+	spin_lock_irqsave(&port->lock, flags);
+	ctl = readl(port->membase + UART_CTRL);
+	if (brk == -1)
+		ctl |= CTRL_SND_BRK_SEQ;
+	else
+		ctl &= ~CTRL_SND_BRK_SEQ;
+	writel(ctl, port->membase + UART_CTRL);
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void mvebu_uart_rx_chars(struct uart_port *port, unsigned int status)
+{
+	struct tty_port *tport = &port->state->port;
+	unsigned char ch = 0;
+	char flag = 0;
+
+	do {
+		if (status & STAT_RX_RDY) {
+			ch = readl(port->membase + UART_RBR);
+			ch &= 0xff;
+			flag = TTY_NORMAL;
+			port->icount.rx++;
+
+			if (status & STAT_PAR_ERR)
+				port->icount.parity++;
+		}
+
+		if (status & STAT_BRK_DET) {
+			port->icount.brk++;
+			status &= ~(STAT_FRM_ERR | STAT_PAR_ERR);
+			if (uart_handle_break(port))
+				goto ignore_char;
+		}
+
+		if (status & STAT_OVR_ERR)
+			port->icount.overrun++;
+
+		if (status & STAT_FRM_ERR)
+			port->icount.frame++;
+
+		if (uart_handle_sysrq_char(port, ch))
+			goto ignore_char;
+
+		if (status & port->ignore_status_mask & STAT_PAR_ERR)
+			status &= ~STAT_RX_RDY;
+
+		status &= port->read_status_mask;
+
+		if (status & STAT_PAR_ERR)
+			flag = TTY_PARITY;
+
+		status &= ~port->ignore_status_mask;
+
+		if (status & STAT_RX_RDY)
+			tty_insert_flip_char(tport, ch, flag);
+
+		if (status & STAT_BRK_DET)
+			tty_insert_flip_char(tport, 0, TTY_BREAK);
+
+		if (status & STAT_FRM_ERR)
+			tty_insert_flip_char(tport, 0, TTY_FRAME);
+
+		if (status & STAT_OVR_ERR)
+			tty_insert_flip_char(tport, 0, TTY_OVERRUN);
+
+ignore_char:
+		status = readl(port->membase + UART_STAT);
+	} while (status & (STAT_RX_RDY | STAT_BRK_DET));
+
+	tty_flip_buffer_push(tport);
+}
+
+static void mvebu_uart_tx_chars(struct uart_port *port, unsigned int status)
+{
+	struct circ_buf *xmit = &port->state->xmit;
+	unsigned int count;
+	unsigned int st;
+
+	if (port->x_char) {
+		writel(port->x_char, port->membase + UART_TSH);
+		port->icount.tx++;
+		port->x_char = 0;
+		return;
+	}
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port)) {
+		mvebu_uart_stop_tx(port);
+		return;
+	}
+
+	for (count = 0; count < port->fifosize; count++) {
+		writel(xmit->buf[xmit->tail], port->membase + UART_TSH);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+
+		if (uart_circ_empty(xmit))
+			break;
+
+		st = readl(port->membase + UART_STAT);
+		if (st & STAT_TX_FIFO_FUL)
+			break;
+	}
+
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(port);
+
+	if (uart_circ_empty(xmit))
+		mvebu_uart_stop_tx(port);
+}
+
+static irqreturn_t mvebu_uart_isr(int irq, void *dev_id)
+{
+	struct uart_port *port = (struct uart_port *)dev_id;
+	unsigned int st = readl(port->membase + UART_STAT);
+
+	if (st & (STAT_RX_RDY | STAT_OVR_ERR | STAT_FRM_ERR | STAT_BRK_DET))
+		mvebu_uart_rx_chars(port, st);
+
+	if (st & STAT_TX_RDY)
+		mvebu_uart_tx_chars(port, st);
+
+	return IRQ_HANDLED;
+}
+
+static int mvebu_uart_startup(struct uart_port *port)
+{
+	int ret;
+
+	writel(CTRL_TXFIFO_RST | CTRL_RXFIFO_RST,
+	       port->membase + UART_CTRL);
+	udelay(1);
+	writel(CTRL_RX_INT, port->membase + UART_CTRL);
+
+	ret = request_irq(port->irq, mvebu_uart_isr, port->irqflags, "serial",
+			  port);
+	if (ret) {
+		dev_err(port->dev, "failed to request irq\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static void mvebu_uart_shutdown(struct uart_port *port)
+{
+	writel(0, port->membase + UART_CTRL);
+}
+
+static void mvebu_uart_set_termios(struct uart_port *port,
+				   struct ktermios *termios,
+				   struct ktermios *old)
+{
+	unsigned long flags;
+	unsigned int baud;
+
+	spin_lock_irqsave(&port->lock, flags);
+
+	port->read_status_mask = STAT_RX_RDY | STAT_OVR_ERR |
+		STAT_TX_RDY | STAT_TX_FIFO_FUL;
+
+	if (termios->c_iflag & INPCK)
+		port->read_status_mask |= STAT_FRM_ERR | STAT_PAR_ERR;
+
+	port->ignore_status_mask = 0;
+	if (termios->c_iflag & IGNPAR)
+		port->ignore_status_mask |=
+			STAT_FRM_ERR | STAT_PAR_ERR | STAT_OVR_ERR;
+
+	if ((termios->c_cflag & CREAD) == 0)
+		port->ignore_status_mask |= STAT_RX_RDY | STAT_BRK_ERR;
+
+	if (old)
+		tty_termios_copy_hw(termios, old);
+
+	baud = uart_get_baud_rate(port, termios, old, 0, 460800);
+	uart_update_timeout(port, termios->c_cflag, baud);
+
+	spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static const char *mvebu_uart_type(struct uart_port *port)
+{
+	return MVEBU_UART_TYPE;
+}
+
+static void mvebu_uart_release_port(struct uart_port *port)
+{
+	/* Nothing to do here */
+}
+
+static int mvebu_uart_request_port(struct uart_port *port)
+{
+	return 0;
+}
+
+#ifdef CONFIG_CONSOLE_POLL
+static int mvebu_uart_get_poll_char(struct uart_port *port)
+{
+	unsigned int st = readl(port->membase + UART_STAT);
+
+	if (!(st & STAT_RX_RDY))
+		return NO_POLL_CHAR;
+
+	return readl(port->membase + UART_RBR);
+}
+
+static void mvebu_uart_put_poll_char(struct uart_port *port, unsigned char c)
+{
+	unsigned int st;
+
+	for (;;) {
+		st = readl(port->membase + UART_STAT);
+
+		if (!(st & STAT_TX_FIFO_FUL))
+			break;
+
+		udelay(1);
+	}
+
+	writel(c, port->membase + UART_TSH);
+}
+#endif
+
+static const struct uart_ops mvebu_uart_ops = {
+	.tx_empty	= mvebu_uart_tx_empty,
+	.set_mctrl	= mvebu_uart_set_mctrl,
+	.get_mctrl	= mvebu_uart_get_mctrl,
+	.stop_tx	= mvebu_uart_stop_tx,
+	.start_tx	= mvebu_uart_start_tx,
+	.stop_rx	= mvebu_uart_stop_rx,
+	.break_ctl	= mvebu_uart_break_ctl,
+	.startup	= mvebu_uart_startup,
+	.shutdown	= mvebu_uart_shutdown,
+	.set_termios	= mvebu_uart_set_termios,
+	.type		= mvebu_uart_type,
+	.release_port	= mvebu_uart_release_port,
+	.request_port	= mvebu_uart_request_port,
+#ifdef CONFIG_CONSOLE_POLL
+	.poll_get_char	= mvebu_uart_get_poll_char,
+	.poll_put_char	= mvebu_uart_put_poll_char,
+#endif
+};
+
+/* Console Driver Operations  */
+
+#ifdef CONFIG_SERIAL_MVEBU_CONSOLE
+/* Early Console */
+static void mvebu_uart_putc(struct uart_port *port, int c)
+{
+	unsigned int st;
+
+	for (;;) {
+		st = readl(port->membase + UART_STAT);
+		if (!(st & STAT_TX_FIFO_FUL))
+			break;
+	}
+
+	writel(c, port->membase + UART_TSH);
+
+	for (;;) {
+		st = readl(port->membase + UART_STAT);
+		if (st & STAT_TX_FIFO_EMP)
+			break;
+	}
+}
+
+static void mvebu_uart_putc_early_write(struct console *con,
+					const char *s,
+					unsigned n)
+{
+	struct earlycon_device *dev = con->data;
+
+	uart_console_write(&dev->port, s, n, mvebu_uart_putc);
+}
+
+static int __init
+mvebu_uart_early_console_setup(struct earlycon_device *device,
+			       const char *opt)
+{
+	if (!device->port.membase)
+		return -ENODEV;
+
+	device->con->write = mvebu_uart_putc_early_write;
+
+	return 0;
+}
+
+EARLYCON_DECLARE(ar3700_uart, mvebu_uart_early_console_setup);
+OF_EARLYCON_DECLARE(ar3700_uart, "marvell,armada-3700-uart",
+		    mvebu_uart_early_console_setup);
+
+static void wait_for_xmitr(struct uart_port *port)
+{
+	u32 val;
+
+	readl_poll_timeout_atomic(port->membase + UART_STAT, val,
+				  (val & STAT_TX_EMP), 1, 10000);
+}
+
+static void mvebu_uart_console_putchar(struct uart_port *port, int ch)
+{
+	wait_for_xmitr(port);
+	writel(ch, port->membase + UART_TSH);
+}
+
+static void mvebu_uart_console_write(struct console *co, const char *s,
+				     unsigned int count)
+{
+	struct uart_port *port = &mvebu_uart_ports[co->index];
+	unsigned long flags;
+	unsigned int ier;
+	int locked = 1;
+
+	if (oops_in_progress)
+		locked = spin_trylock_irqsave(&port->lock, flags);
+	else
+		spin_lock_irqsave(&port->lock, flags);
+
+	ier = readl(port->membase + UART_CTRL) &
+		(CTRL_RX_INT | CTRL_TX_RDY_INT);
+	writel(0, port->membase + UART_CTRL);
+
+	uart_console_write(port, s, count, mvebu_uart_console_putchar);
+
+	wait_for_xmitr(port);
+
+	if (ier)
+		writel(ier, port->membase + UART_CTRL);
+
+	if (locked)
+		spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static int mvebu_uart_console_setup(struct console *co, char *options)
+{
+	struct uart_port *port;
+	int baud = 9600;
+	int bits = 8;
+	int parity = 'n';
+	int flow = 'n';
+
+	if (co->index < 0 || co->index >= MVEBU_NR_UARTS)
+		return -EINVAL;
+
+	port = &mvebu_uart_ports[co->index];
+
+	if (!port->mapbase || !port->membase) {
+		pr_debug("console on ttyMV%i not present\n", co->index);
+		return -ENODEV;
+	}
+
+	if (options)
+		uart_parse_options(options, &baud, &parity, &bits, &flow);
+
+	return uart_set_options(port, co, baud, parity, bits, flow);
+}
+
+static struct uart_driver mvebu_uart_driver;
+
+static struct console mvebu_uart_console = {
+	.name	= "ttyMV",
+	.write	= mvebu_uart_console_write,
+	.device	= uart_console_device,
+	.setup	= mvebu_uart_console_setup,
+	.flags	= CON_PRINTBUFFER,
+	.index	= -1,
+	.data	= &mvebu_uart_driver,
+};
+
+static int __init mvebu_uart_console_init(void)
+{
+	register_console(&mvebu_uart_console);
+	return 0;
+}
+
+console_initcall(mvebu_uart_console_init);
+
+
+#endif /* CONFIG_SERIAL_MVEBU_CONSOLE */
+
+static struct uart_driver mvebu_uart_driver = {
+	.owner			= THIS_MODULE,
+	.driver_name		= "mvebu_serial",
+	.dev_name		= "ttyMV",
+	.nr			= MVEBU_NR_UARTS,
+#ifdef CONFIG_SERIAL_MVEBU_CONSOLE
+	.cons			= &mvebu_uart_console,
+#endif
+};
+
+static int mvebu_uart_probe(struct platform_device *pdev)
+{
+	struct resource *reg = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	struct resource *irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
+	struct uart_port *port;
+	struct mvebu_uart_data *data;
+	int ret;
+
+	if (!reg || !irq) {
+		dev_err(&pdev->dev, "no registers/irq defined\n");
+		return -EINVAL;
+	}
+
+	port = &mvebu_uart_ports[0];
+
+	spin_lock_init(&port->lock);
+
+	port->dev        = &pdev->dev;
+	port->type       = PORT_MVEBU;
+	port->ops        = &mvebu_uart_ops;
+	port->regshift   = 0;
+
+	port->fifosize   = 32;
+	port->iotype     = UPIO_MEM32;
+	port->flags      = UPF_FIXED_PORT;
+	port->line       = 0; /* single port: force line number to  0 */
+
+	port->irq        = irq->start;
+	port->irqflags   = 0;
+	port->mapbase    = reg->start;
+
+	port->membase = devm_ioremap_resource(&pdev->dev, reg);
+	if (IS_ERR(port->membase))
+		return -PTR_ERR(port->membase);
+
+	data = devm_kzalloc(&pdev->dev, sizeof(struct mvebu_uart_data),
+			    GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	data->port = port;
+
+	port->private_data = data;
+	platform_set_drvdata(pdev, data);
+
+	ret = uart_add_one_port(&mvebu_uart_driver, port);
+	if (ret)
+		return ret;
+	return 0;
+}
+
+static int mvebu_uart_remove(struct platform_device *pdev)
+{
+	struct mvebu_uart_data *data = platform_get_drvdata(pdev);
+
+	uart_remove_one_port(&mvebu_uart_driver, data->port);
+	data->port->private_data = NULL;
+	data->port->mapbase      = 0;
+	return 0;
+}
+
+/* Match table for of_platform binding */
+static const struct of_device_id mvebu_uart_of_match[] = {
+	{ .compatible = "marvell,armada-3700-uart", },
+	{}
+};
+MODULE_DEVICE_TABLE(of, mvebu_uart_of_match);
+
+static struct platform_driver mvebu_uart_platform_driver = {
+	.probe	= mvebu_uart_probe,
+	.remove	= mvebu_uart_remove,
+	.driver	= {
+		.owner	= THIS_MODULE,
+		.name  = "mvebu-uart",
+		.of_match_table = of_match_ptr(mvebu_uart_of_match),
+	},
+};
+
+static int __init mvebu_uart_init(void)
+{
+	int ret;
+
+	ret = uart_register_driver(&mvebu_uart_driver);
+	if (ret)
+		return ret;
+
+	ret = platform_driver_register(&mvebu_uart_platform_driver);
+	if (ret)
+		uart_unregister_driver(&mvebu_uart_driver);
+
+	return ret;
+}
+
+static void __exit mvebu_uart_exit(void)
+{
+	platform_driver_unregister(&mvebu_uart_platform_driver);
+	uart_unregister_driver(&mvebu_uart_driver);
+}
+
+arch_initcall(mvebu_uart_init);
+module_exit(mvebu_uart_exit);
+
+MODULE_AUTHOR("Wilson Ding <dingwei@marvell.com>");
+MODULE_DESCRIPTION("Marvell Armada-3700 Serial Driver");
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h
index 3e5d757407fb..e513a4ee369b 100644
--- a/include/uapi/linux/serial_core.h
+++ b/include/uapi/linux/serial_core.h
@@ -261,4 +261,7 @@
 /* STM32 USART */
 #define PORT_STM32	113
 
+/* MVEBU UART */
+#define PORT_MVEBU	114
+
 #endif /* _UAPILINUX_SERIAL_CORE_H */
-- 
cgit v1.2.3


From 8afd54c87ad7089734ef0527937a256586ba828a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:03 +0100
Subject: bpf: add flags to bpf_skb_store_bytes for clearing hash

When overwriting parts of the packet with bpf_skb_store_bytes() that
were fed previously into skb->hash calculation, we should clear the
current hash with skb_clear_hash(), so that a next skb_get_hash() call
can determine the correct hash related to this skb.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ee2193287cbe..2e3e90309904 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -305,6 +305,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_skb_store_bytes flags. */
 #define BPF_F_RECOMPUTE_CSUM		(1ULL << 0)
+#define BPF_F_INVALIDATE_HASH		(1ULL << 1)
 
 /* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags.
  * First 4 bits are for passing the header field size.
diff --git a/net/core/filter.c b/net/core/filter.c
index 356a251657a5..a1fe246a6147 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1353,7 +1353,7 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	unsigned int len = (unsigned int) r4;
 	void *ptr;
 
-	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM)))
+	if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
 		return -EINVAL;
 
 	/* bpf verifier guarantees that:
@@ -1384,6 +1384,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 
 	if (flags & BPF_F_RECOMPUTE_CSUM)
 		skb_postpush_rcsum(skb, ptr, len);
+	if (flags & BPF_F_INVALIDATE_HASH)
+		skb_clear_hash(skb);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 2208087061c4ad88de188911367effc550144836 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:05 +0100
Subject: bpf: allow to propagate df in bpf_skb_set_tunnel_key

Added by 9a628224a61b ("ip_tunnel: Add dont fragment flag."), allow to
feed df flag into tunneling facilities (currently supported on TX by
vxlan, geneve and gre) as a hint from eBPF's bpf_skb_set_tunnel_key()
helper.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 2e3e90309904..21ee6d52016f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -330,6 +330,7 @@ enum bpf_func_id {
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
+#define BPF_F_DONT_FRAGMENT		(1ULL << 2)
 
 /* user accessible mirror of in-kernel sk_buff.
  * new fields can only be added to the end of this structure
diff --git a/net/core/filter.c b/net/core/filter.c
index ce4e18dd2c89..6c9d15561d04 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1819,7 +1819,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	u8 compat[sizeof(struct bpf_tunnel_key)];
 	struct ip_tunnel_info *info;
 
-	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX)))
+	if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+			       BPF_F_DONT_FRAGMENT)))
 		return -EINVAL;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
@@ -1844,6 +1845,9 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	info->mode = IP_TUNNEL_INFO_TX;
 
 	info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM;
+	if (flags & BPF_F_DONT_FRAGMENT)
+		info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+
 	info->key.tun_id = cpu_to_be64(from->tunnel_id);
 	info->key.tos = from->tunnel_tos;
 	info->key.ttl = from->tunnel_ttl;
-- 
cgit v1.2.3


From 14ca0751c96f8d3d0f52e8ed3b3236f8b34d3460 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Fri, 4 Mar 2016 15:15:06 +0100
Subject: bpf: support for access to tunnel options

After eBPF being able to programmatically access/manage tunnel key meta
data via commit d3aa45ce6b94 ("bpf: add helpers to access tunnel metadata")
and more recently also for IPv6 through c6c33454072f ("bpf: support ipv6
for bpf_skb_{set,get}_tunnel_key"), this work adds two complementary
helpers to generically access their auxiliary tunnel options.

Geneve and vxlan support this facility. For geneve, TLVs can be pushed,
and for the vxlan case its GBP extension. I.e. setting tunnel key for geneve
case only makes sense, if we can also read/write TLVs into it. In the GBP
case, it provides the flexibility to easily map the group policy ID in
combination with other helpers or maps.

I chose to model this as two separate helpers, bpf_skb_{set,get}_tunnel_opt(),
for a couple of reasons. bpf_skb_{set,get}_tunnel_key() is already rather
complex by itself, and there may be cases for tunnel key backends where
tunnel options are not always needed. If we would have integrated this
into bpf_skb_{set,get}_tunnel_key() nevertheless, we are very limited with
remaining helper arguments, so keeping compatibility on structs in case of
passing in a flat buffer gets more cumbersome. Separating both also allows
for more flexibility and future extensibility, f.e. options could be fed
directly from a map, etc.

Moreover, change geneve's xmit path to test only for info->options_len
instead of TUNNEL_GENEVE_OPT flag. This makes it more consistent with vxlan's
xmit path and allows for avoiding to specify a protocol flag in the API on
xmit, so it can be protocol agnostic. Having info->options_len is enough
information that is needed. Tested with vxlan and geneve.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c     |  4 +--
 include/uapi/linux/bpf.h | 11 +++++++
 net/core/filter.c        | 83 ++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 90 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index bc5da357e16d..36db4cf0579c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -940,7 +940,7 @@ static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		u8 vni[3];
 
 		tunnel_id_to_vni(key->tun_id, vni);
-		if (key->tun_flags & TUNNEL_GENEVE_OPT)
+		if (info->options_len)
 			opts = ip_tunnel_info_opts(info);
 
 		if (key->tun_flags & TUNNEL_CSUM)
@@ -1027,7 +1027,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		u8 vni[3];
 
 		tunnel_id_to_vni(key->tun_id, vni);
-		if (key->tun_flags & TUNNEL_GENEVE_OPT)
+		if (info->options_len)
 			opts = ip_tunnel_info_opts(info);
 
 		if (key->tun_flags & TUNNEL_CSUM)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 21ee6d52016f..9221f653fee3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -298,6 +298,17 @@ enum bpf_func_id {
 	 * Return: csum result
 	 */
 	BPF_FUNC_csum_diff,
+
+	/**
+	 * bpf_skb_[gs]et_tunnel_opt(skb, opt, size)
+	 * retrieve or populate tunnel options metadata
+	 * @skb: pointer to skb
+	 * @opt: pointer to raw tunnel option data
+	 * @size: size of @opt
+	 * Return: 0 on success for set, option size for get
+	 */
+	BPF_FUNC_skb_get_tunnel_opt,
+	BPF_FUNC_skb_set_tunnel_opt,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 6c9d15561d04..012a10c2da94 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1809,6 +1809,32 @@ static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
+static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	u8 *to = (u8 *) (long) r2;
+	const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+	if (unlikely(!info ||
+		     !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)))
+		return -ENOENT;
+	if (unlikely(size < info->options_len))
+		return -ENOMEM;
+
+	ip_tunnel_info_opts_get(to, info);
+
+	return info->options_len;
+}
+
+static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
+	.func		= bpf_skb_get_tunnel_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
 static struct metadata_dst __percpu *md_dst;
 
 static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
@@ -1875,17 +1901,58 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
 	.arg4_type	= ARG_ANYTHING,
 };
 
-static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
+#define BPF_TUNLEN_MAX	255
+
+static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+	struct sk_buff *skb = (struct sk_buff *) (long) r1;
+	u8 *from = (u8 *) (long) r2;
+	struct ip_tunnel_info *info = skb_tunnel_info(skb);
+	const struct metadata_dst *md = this_cpu_ptr(md_dst);
+
+	if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
+		return -EINVAL;
+	if (unlikely(size > BPF_TUNLEN_MAX))
+		return -ENOMEM;
+
+	ip_tunnel_info_opts_set(info, from, size);
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+	.func		= bpf_skb_set_tunnel_opt,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_STACK,
+	.arg3_type	= ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *
+bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
 {
 	if (!md_dst) {
-		/* race is not possible, since it's called from
-		 * verifier that is holding verifier mutex
+		BUILD_BUG_ON(FIELD_SIZEOF(struct ip_tunnel_info,
+					  options_len) != 1);
+
+		/* Race is not possible, since it's called from verifier
+		 * that is holding verifier mutex.
 		 */
-		md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
+		md_dst = metadata_dst_alloc_percpu(BPF_TUNLEN_MAX,
+						   GFP_KERNEL);
 		if (!md_dst)
 			return NULL;
 	}
-	return &bpf_skb_set_tunnel_key_proto;
+
+	switch (which) {
+	case BPF_FUNC_skb_set_tunnel_key:
+		return &bpf_skb_set_tunnel_key_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return &bpf_skb_set_tunnel_opt_proto;
+	default:
+		return NULL;
+	}
 }
 
 static const struct bpf_func_proto *
@@ -1939,7 +2006,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_skb_get_tunnel_key:
 		return &bpf_skb_get_tunnel_key_proto;
 	case BPF_FUNC_skb_set_tunnel_key:
-		return bpf_get_skb_set_tunnel_key_proto();
+		return bpf_get_skb_set_tunnel_proto(func_id);
+	case BPF_FUNC_skb_get_tunnel_opt:
+		return &bpf_skb_get_tunnel_opt_proto;
+	case BPF_FUNC_skb_set_tunnel_opt:
+		return bpf_get_skb_set_tunnel_proto(func_id);
 	case BPF_FUNC_redirect:
 		return &bpf_redirect_proto;
 	case BPF_FUNC_get_route_realm:
-- 
cgit v1.2.3


From 6c90598174322b8888029e40dd84a4eb01f56afe Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 7 Mar 2016 21:57:15 -0800
Subject: bpf: pre-allocate hash map elements

If kprobe is placed on spin_unlock then calling kmalloc/kfree from
bpf programs is not safe, since the following dead lock is possible:
kfree->spin_lock(kmem_cache_node->lock)...spin_unlock->kprobe->
bpf_prog->map_update->kmalloc->spin_lock(of the same kmem_cache_node->lock)
and deadlocks.

The following solutions were considered and some implemented, but
eventually discarded
- kmem_cache_create for every map
- add recursion check to slow-path of slub
- use reserved memory in bpf_map_update for in_irq or in preempt_disabled
- kmalloc via irq_work

At the end pre-allocation of all map elements turned out to be the simplest
solution and since the user is charged upfront for all the memory, such
pre-allocation doesn't affect the user space visible behavior.

Since it's impossible to tell whether kprobe is triggered in a safe
location from kmalloc point of view, use pre-allocation by default
and introduce new BPF_F_NO_PREALLOC flag.

While testing of per-cpu hash maps it was discovered
that alloc_percpu(GFP_ATOMIC) has odd corner cases and often
fails to allocate memory even when 90% of it is free.
The pre-allocation of per-cpu hash elements solves this problem as well.

Turned out that bpf_map_update() quickly followed by
bpf_map_lookup()+bpf_map_delete() is very common pattern used
in many of iovisor/bcc/tools, so there is additional benefit of
pre-allocation, since such use cases are must faster.

Since all hash map elements are now pre-allocated we can remove
atomic increment of htab->count and save few more cycles.

Also add bpf_map_precharge_memlock() to check rlimit_memlock early to avoid
large malloc/free done by users who don't have sufficient limits.

Pre-allocation is done with vmalloc and alloc/free is done
via percpu_freelist. Here are performance numbers for different
pre-allocation algorithms that were implemented, but discarded
in favor of percpu_freelist:

1 cpu:
pcpu_ida	2.1M
pcpu_ida nolock	2.3M
bt		2.4M
kmalloc		1.8M
hlist+spinlock	2.3M
pcpu_freelist	2.6M

4 cpu:
pcpu_ida	1.5M
pcpu_ida nolock	1.8M
bt w/smp_align	1.7M
bt no/smp_align	1.1M
kmalloc		0.7M
hlist+spinlock	0.2M
pcpu_freelist	2.0M

8 cpu:
pcpu_ida	0.7M
bt w/smp_align	0.8M
kmalloc		0.4M
pcpu_freelist	1.5M

32 cpu:
kmalloc		0.13M
pcpu_freelist	0.49M

pcpu_ida nolock is a modified percpu_ida algorithm without
percpu_ida_cpu locks and without cross-cpu tag stealing.
It's faster than existing percpu_ida, but not as fast as pcpu_freelist.

bt is a variant of block/blk-mq-tag.c simlified and customized
for bpf use case. bt w/smp_align is using cache line for every 'long'
(similar to blk-mq-tag). bt no/smp_align allocates 'long'
bitmasks continuously to save memory. It's comparable to percpu_ida
and in some cases faster, but slower than percpu_freelist

hlist+spinlock is the simplest free list with single spinlock.
As expeceted it has very bad scaling in SMP.

kmalloc is existing implementation which is still available via
BPF_F_NO_PREALLOC flag. It's significantly slower in single cpu and
in 8 cpu setup it's 3 times slower than pre-allocation with pcpu_freelist,
but saves memory, so in cases where map->max_entries can be large
and number of map update/delete per second is low, it may make
sense to use it.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h      |   2 +
 include/uapi/linux/bpf.h |   3 +
 kernel/bpf/hashtab.c     | 240 +++++++++++++++++++++++++++++++++--------------
 kernel/bpf/syscall.c     |  15 ++-
 4 files changed, 186 insertions(+), 74 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 4b070827200d..efd1d4ca95c6 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -37,6 +37,7 @@ struct bpf_map {
 	u32 key_size;
 	u32 value_size;
 	u32 max_entries;
+	u32 map_flags;
 	u32 pages;
 	struct user_struct *user;
 	const struct bpf_map_ops *ops;
@@ -178,6 +179,7 @@ struct bpf_map *__bpf_map_get(struct fd f);
 void bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
+int bpf_map_precharge_memlock(u32 pages);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 9221f653fee3..0e30b19012a5 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -101,12 +101,15 @@ enum bpf_prog_type {
 #define BPF_NOEXIST	1 /* create new element if it didn't exist */
 #define BPF_EXIST	2 /* update existing element */
 
+#define BPF_F_NO_PREALLOC	(1U << 0)
+
 union bpf_attr {
 	struct { /* anonymous struct used by BPF_MAP_CREATE command */
 		__u32	map_type;	/* one of enum bpf_map_type */
 		__u32	key_size;	/* size of key in bytes */
 		__u32	value_size;	/* size of value in bytes */
 		__u32	max_entries;	/* max number of entries in a map */
+		__u32	map_flags;	/* prealloc or not */
 	};
 
 	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index a68e95133fcd..fff3650d52fc 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1,4 +1,5 @@
 /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of version 2 of the GNU General Public
@@ -13,6 +14,7 @@
 #include <linux/jhash.h>
 #include <linux/filter.h>
 #include <linux/vmalloc.h>
+#include "percpu_freelist.h"
 
 struct bucket {
 	struct hlist_head head;
@@ -22,6 +24,8 @@ struct bucket {
 struct bpf_htab {
 	struct bpf_map map;
 	struct bucket *buckets;
+	void *elems;
+	struct pcpu_freelist freelist;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
@@ -29,15 +33,86 @@ struct bpf_htab {
 
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
-	struct hlist_node hash_node;
-	struct rcu_head rcu;
 	union {
-		u32 hash;
-		u32 key_size;
+		struct hlist_node hash_node;
+		struct bpf_htab *htab;
+		struct pcpu_freelist_node fnode;
 	};
+	struct rcu_head rcu;
+	u32 hash;
 	char key[0] __aligned(8);
 };
 
+static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
+				     void __percpu *pptr)
+{
+	*(void __percpu **)(l->key + key_size) = pptr;
+}
+
+static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
+{
+	return *(void __percpu **)(l->key + key_size);
+}
+
+static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
+{
+	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
+}
+
+static void htab_free_elems(struct bpf_htab *htab)
+{
+	int i;
+
+	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+		goto free_elems;
+
+	for (i = 0; i < htab->map.max_entries; i++) {
+		void __percpu *pptr;
+
+		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
+					 htab->map.key_size);
+		free_percpu(pptr);
+	}
+free_elems:
+	vfree(htab->elems);
+}
+
+static int prealloc_elems_and_freelist(struct bpf_htab *htab)
+{
+	int err = -ENOMEM, i;
+
+	htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
+	if (!htab->elems)
+		return -ENOMEM;
+
+	if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
+		goto skip_percpu_elems;
+
+	for (i = 0; i < htab->map.max_entries; i++) {
+		u32 size = round_up(htab->map.value_size, 8);
+		void __percpu *pptr;
+
+		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
+		if (!pptr)
+			goto free_elems;
+		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
+				  pptr);
+	}
+
+skip_percpu_elems:
+	err = pcpu_freelist_init(&htab->freelist);
+	if (err)
+		goto free_elems;
+
+	pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size,
+			       htab->map.max_entries);
+	return 0;
+
+free_elems:
+	htab_free_elems(htab);
+	return err;
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -46,6 +121,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	int err, i;
 	u64 cost;
 
+	if (attr->map_flags & ~BPF_F_NO_PREALLOC)
+		/* reserved bits should not be used */
+		return ERR_PTR(-EINVAL);
+
 	htab = kzalloc(sizeof(*htab), GFP_USER);
 	if (!htab)
 		return ERR_PTR(-ENOMEM);
@@ -55,6 +134,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	htab->map.key_size = attr->key_size;
 	htab->map.value_size = attr->value_size;
 	htab->map.max_entries = attr->max_entries;
+	htab->map.map_flags = attr->map_flags;
 
 	/* check sanity of attributes.
 	 * value_size == 0 may be allowed in the future to use map as a set
@@ -92,7 +172,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (percpu)
 		htab->elem_size += sizeof(void *);
 	else
-		htab->elem_size += htab->map.value_size;
+		htab->elem_size += round_up(htab->map.value_size, 8);
 
 	/* prevent zero size kmalloc and check for u32 overflow */
 	if (htab->n_buckets == 0 ||
@@ -112,6 +192,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 
 	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
+	/* if map size is larger than memlock limit, reject it early */
+	err = bpf_map_precharge_memlock(htab->map.pages);
+	if (err)
+		goto free_htab;
+
 	err = -ENOMEM;
 	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
 				      GFP_USER | __GFP_NOWARN);
@@ -127,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
-	atomic_set(&htab->count, 0);
+	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
+		err = prealloc_elems_and_freelist(htab);
+		if (err)
+			goto free_buckets;
+	}
 
 	return &htab->map;
 
+free_buckets:
+	kvfree(htab->buckets);
 free_htab:
 	kfree(htab);
 	return ERR_PTR(err);
@@ -249,42 +340,42 @@ find_first_elem:
 		}
 	}
 
-	/* itereated over all buckets and all elements */
+	/* iterated over all buckets and all elements */
 	return -ENOENT;
 }
 
-
-static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
-				     void __percpu *pptr)
-{
-	*(void __percpu **)(l->key + key_size) = pptr;
-}
-
-static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
-{
-	return *(void __percpu **)(l->key + key_size);
-}
-
-static void htab_percpu_elem_free(struct htab_elem *l)
+static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 {
-	free_percpu(htab_elem_get_ptr(l, l->key_size));
+	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
+		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 	kfree(l);
+
 }
 
-static void htab_percpu_elem_free_rcu(struct rcu_head *head)
+static void htab_elem_free_rcu(struct rcu_head *head)
 {
 	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
+	struct bpf_htab *htab = l->htab;
 
-	htab_percpu_elem_free(l);
+	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
+	 * we're calling kfree, otherwise deadlock is possible if kprobes
+	 * are placed somewhere inside of slub
+	 */
+	preempt_disable();
+	__this_cpu_inc(bpf_prog_active);
+	htab_elem_free(htab, l);
+	__this_cpu_dec(bpf_prog_active);
+	preempt_enable();
 }
 
-static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
+static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
-	if (percpu) {
-		l->key_size = key_size;
-		call_rcu(&l->rcu, htab_percpu_elem_free_rcu);
+	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
-		kfree_rcu(l, rcu);
+		atomic_dec(&htab->count);
+		l->htab = htab;
+		call_rcu(&l->rcu, htab_elem_free_rcu);
 	}
 }
 
@@ -293,23 +384,39 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 bool percpu, bool onallcpus)
 {
 	u32 size = htab->map.value_size;
+	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 	struct htab_elem *l_new;
 	void __percpu *pptr;
 
-	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-	if (!l_new)
-		return NULL;
+	if (prealloc) {
+		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
+		if (!l_new)
+			return ERR_PTR(-E2BIG);
+	} else {
+		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
+			atomic_dec(&htab->count);
+			return ERR_PTR(-E2BIG);
+		}
+		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+		if (!l_new)
+			return ERR_PTR(-ENOMEM);
+	}
 
 	memcpy(l_new->key, key, key_size);
 	if (percpu) {
 		/* round up value_size to 8 bytes */
 		size = round_up(size, 8);
 
-		/* alloc_percpu zero-fills */
-		pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN);
-		if (!pptr) {
-			kfree(l_new);
-			return NULL;
+		if (prealloc) {
+			pptr = htab_elem_get_ptr(l_new, key_size);
+		} else {
+			/* alloc_percpu zero-fills */
+			pptr = __alloc_percpu_gfp(size, 8,
+						  GFP_ATOMIC | __GFP_NOWARN);
+			if (!pptr) {
+				kfree(l_new);
+				return ERR_PTR(-ENOMEM);
+			}
 		}
 
 		if (!onallcpus) {
@@ -324,7 +431,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 				off += size;
 			}
 		}
-		htab_elem_set_ptr(l_new, key_size, pptr);
+		if (!prealloc)
+			htab_elem_set_ptr(l_new, key_size, pptr);
 	} else {
 		memcpy(l_new->key + round_up(key_size, 8), value, size);
 	}
@@ -336,12 +444,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
 		       u64 map_flags)
 {
-	if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries))
-		/* if elem with this 'key' doesn't exist and we've reached
-		 * max_entries limit, fail insertion of new elem
-		 */
-		return -E2BIG;
-
 	if (l_old && map_flags == BPF_NOEXIST)
 		/* elem already exists */
 		return -EEXIST;
@@ -375,13 +477,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 
 	hash = htab_map_hash(key, key_size);
 
-	/* allocate new element outside of the lock, since
-	 * we're most likley going to insert it
-	 */
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
-	if (!l_new)
-		return -ENOMEM;
-
 	b = __select_bucket(htab, hash);
 	head = &b->head;
 
@@ -394,21 +489,24 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+	if (IS_ERR(l_new)) {
+		/* all pre-allocated elements are in use or memory exhausted */
+		ret = PTR_ERR(l_new);
+		goto err;
+	}
+
 	/* add new element to the head of the list, so that
 	 * concurrent search will find it before old elem
 	 */
 	hlist_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
 		hlist_del_rcu(&l_old->hash_node);
-		kfree_rcu(l_old, rcu);
-	} else {
-		atomic_inc(&htab->count);
+		free_htab_elem(htab, l_old);
 	}
-	raw_spin_unlock_irqrestore(&b->lock, flags);
-	return 0;
+	ret = 0;
 err:
 	raw_spin_unlock_irqrestore(&b->lock, flags);
-	kfree(l_new);
 	return ret;
 }
 
@@ -466,12 +564,11 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
 					hash, true, onallcpus);
-		if (!l_new) {
-			ret = -ENOMEM;
+		if (IS_ERR(l_new)) {
+			ret = PTR_ERR(l_new);
 			goto err;
 		}
 		hlist_add_head_rcu(&l_new->hash_node, head);
-		atomic_inc(&htab->count);
 	}
 	ret = 0;
 err:
@@ -489,7 +586,6 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 static int htab_map_delete_elem(struct bpf_map *map, void *key)
 {
 	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
 	struct hlist_head *head;
 	struct bucket *b;
 	struct htab_elem *l;
@@ -511,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
 
 	if (l) {
 		hlist_del_rcu(&l->hash_node);
-		atomic_dec(&htab->count);
-		free_htab_elem(l, percpu, key_size);
+		free_htab_elem(htab, l);
 		ret = 0;
 	}
 
@@ -531,17 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab)
 
 		hlist_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_del_rcu(&l->hash_node);
-			atomic_dec(&htab->count);
-			if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
-				l->key_size = htab->map.key_size;
-				htab_percpu_elem_free(l);
-			} else {
-				kfree(l);
-			}
+			htab_elem_free(htab, l);
 		}
 	}
 }
-
 /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
 static void htab_map_free(struct bpf_map *map)
 {
@@ -554,10 +642,16 @@ static void htab_map_free(struct bpf_map *map)
 	 */
 	synchronize_rcu();
 
-	/* some of kfree_rcu() callbacks for elements of this map may not have
-	 * executed. It's ok. Proceed to free residual elements and map itself
+	/* some of free_htab_elem() callbacks for elements of this map may
+	 * not have executed. Wait for them.
 	 */
-	delete_all_elements(htab);
+	rcu_barrier();
+	if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
+		delete_all_elements(htab);
+	} else {
+		htab_free_elems(htab);
+		pcpu_freelist_destroy(&htab->freelist);
+	}
 	kvfree(htab->buckets);
 	kfree(htab);
 }
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index dc99f6a000f5..cbd94b2144ff 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -48,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
 	list_add(&tl->list_node, &bpf_map_types);
 }
 
+int bpf_map_precharge_memlock(u32 pages)
+{
+	struct user_struct *user = get_current_user();
+	unsigned long memlock_limit, cur;
+
+	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	cur = atomic_long_read(&user->locked_vm);
+	free_uid(user);
+	if (cur + pages > memlock_limit)
+		return -EPERM;
+	return 0;
+}
+
 static int bpf_map_charge_memlock(struct bpf_map *map)
 {
 	struct user_struct *user = get_current_user();
@@ -153,7 +166,7 @@ int bpf_map_new_fd(struct bpf_map *map)
 		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
 		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
 
-#define BPF_MAP_CREATE_LAST_FIELD max_entries
+#define BPF_MAP_CREATE_LAST_FIELD map_flags
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
-- 
cgit v1.2.3


From ab7ac4eb9832e32a09f4e8042705484d2fb0aad3 Mon Sep 17 00:00:00 2001
From: Tom Herbert <tom@herbertland.com>
Date: Mon, 7 Mar 2016 14:11:06 -0800
Subject: kcm: Kernel Connection Multiplexor module

This module implements the Kernel Connection Multiplexor.

Kernel Connection Multiplexor (KCM) is a facility that provides a
message based interface over TCP for generic application protocols.
With KCM an application can efficiently send and receive application
protocol messages over TCP using datagram sockets.

For more information see the included Documentation/networking/kcm.txt

Signed-off-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/socket.h   |    6 +-
 include/net/kcm.h        |  125 +++
 include/uapi/linux/kcm.h |   40 +
 net/Kconfig              |    1 +
 net/Makefile             |    1 +
 net/kcm/Kconfig          |   10 +
 net/kcm/Makefile         |    3 +
 net/kcm/kcmsock.c        | 2016 ++++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 2201 insertions(+), 1 deletion(-)
 create mode 100644 include/net/kcm.h
 create mode 100644 include/uapi/linux/kcm.h
 create mode 100644 net/kcm/Kconfig
 create mode 100644 net/kcm/Makefile
 create mode 100644 net/kcm/kcmsock.c

(limited to 'include/uapi/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index d834af22a460..73bf6c6a833b 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -200,7 +200,9 @@ struct ucred {
 #define AF_ALG		38	/* Algorithm sockets		*/
 #define AF_NFC		39	/* NFC sockets			*/
 #define AF_VSOCK	40	/* vSockets			*/
-#define AF_MAX		41	/* For now.. */
+#define AF_KCM		41	/* Kernel Connection Multiplexor*/
+
+#define AF_MAX		42	/* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC	AF_UNSPEC
@@ -246,6 +248,7 @@ struct ucred {
 #define PF_ALG		AF_ALG
 #define PF_NFC		AF_NFC
 #define PF_VSOCK	AF_VSOCK
+#define PF_KCM		AF_KCM
 #define PF_MAX		AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
@@ -323,6 +326,7 @@ struct ucred {
 #define SOL_CAIF	278
 #define SOL_ALG		279
 #define SOL_NFC		280
+#define SOL_KCM		281
 
 /* IPX options */
 #define IPX_TYPE	1
diff --git a/include/net/kcm.h b/include/net/kcm.h
new file mode 100644
index 000000000000..1bcae39070ec
--- /dev/null
+++ b/include/net/kcm.h
@@ -0,0 +1,125 @@
+/*
+ * Kernel Connection Multiplexor
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ */
+
+#ifndef __NET_KCM_H_
+#define __NET_KCM_H_
+
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <uapi/linux/kcm.h>
+
+extern unsigned int kcm_net_id;
+
+struct kcm_tx_msg {
+	unsigned int sent;
+	unsigned int fragidx;
+	unsigned int frag_offset;
+	unsigned int msg_flags;
+	struct sk_buff *frag_skb;
+	struct sk_buff *last_skb;
+};
+
+struct kcm_rx_msg {
+	int full_len;
+	int accum_len;
+	int offset;
+};
+
+/* Socket structure for KCM client sockets */
+struct kcm_sock {
+	struct sock sk;
+	struct kcm_mux *mux;
+	struct list_head kcm_sock_list;
+	int index;
+	u32 done : 1;
+	struct work_struct done_work;
+
+	/* Transmit */
+	struct kcm_psock *tx_psock;
+	struct work_struct tx_work;
+	struct list_head wait_psock_list;
+	struct sk_buff *seq_skb;
+
+	/* Don't use bit fields here, these are set under different locks */
+	bool tx_wait;
+	bool tx_wait_more;
+
+	/* Receive */
+	struct kcm_psock *rx_psock;
+	struct list_head wait_rx_list; /* KCMs waiting for receiving */
+	bool rx_wait;
+	u32 rx_disabled : 1;
+};
+
+struct bpf_prog;
+
+/* Structure for an attached lower socket */
+struct kcm_psock {
+	struct sock *sk;
+	struct kcm_mux *mux;
+	int index;
+
+	u32 tx_stopped : 1;
+	u32 rx_stopped : 1;
+	u32 done : 1;
+	u32 unattaching : 1;
+
+	void (*save_state_change)(struct sock *sk);
+	void (*save_data_ready)(struct sock *sk);
+	void (*save_write_space)(struct sock *sk);
+
+	struct list_head psock_list;
+
+	/* Receive */
+	struct sk_buff *rx_skb_head;
+	struct sk_buff **rx_skb_nextp;
+	struct sk_buff *ready_rx_msg;
+	struct list_head psock_ready_list;
+	struct work_struct rx_work;
+	struct delayed_work rx_delayed_work;
+	struct bpf_prog *bpf_prog;
+	struct kcm_sock *rx_kcm;
+
+	/* Transmit */
+	struct kcm_sock *tx_kcm;
+	struct list_head psock_avail_list;
+};
+
+/* Per net MUX list */
+struct kcm_net {
+	struct mutex mutex;
+	struct list_head mux_list;
+	int count;
+};
+
+/* Structure for a MUX */
+struct kcm_mux {
+	struct list_head kcm_mux_list;
+	struct rcu_head rcu;
+	struct kcm_net *knet;
+
+	struct list_head kcm_socks;	/* All KCM sockets on MUX */
+	int kcm_socks_cnt;		/* Total KCM socket count for MUX */
+	struct list_head psocks;	/* List of all psocks on MUX */
+	int psocks_cnt;		/* Total attached sockets */
+
+	/* Receive */
+	spinlock_t rx_lock ____cacheline_aligned_in_smp;
+	struct list_head kcm_rx_waiters; /* KCMs waiting for receiving */
+	struct list_head psocks_ready;	/* List of psocks with a msg ready */
+	struct sk_buff_head rx_hold_queue;
+
+	/* Transmit */
+	spinlock_t  lock ____cacheline_aligned_in_smp;	/* TX and mux locking */
+	struct list_head psocks_avail;	/* List of available psocks */
+	struct list_head kcm_tx_waiters; /* KCMs waiting for a TX psock */
+};
+
+#endif /* __NET_KCM_H_ */
diff --git a/include/uapi/linux/kcm.h b/include/uapi/linux/kcm.h
new file mode 100644
index 000000000000..a5a530940b99
--- /dev/null
+++ b/include/uapi/linux/kcm.h
@@ -0,0 +1,40 @@
+/*
+ * Kernel Connection Multiplexor
+ *
+ * Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * User API to clone KCM sockets and attach transport socket to a KCM
+ * multiplexor.
+ */
+
+#ifndef KCM_KERNEL_H
+#define KCM_KERNEL_H
+
+struct kcm_attach {
+	int fd;
+	int bpf_fd;
+};
+
+struct kcm_unattach {
+	int fd;
+};
+
+struct kcm_clone {
+	int fd;
+};
+
+#define SIOCKCMATTACH	(SIOCPROTOPRIVATE + 0)
+#define SIOCKCMUNATTACH	(SIOCPROTOPRIVATE + 1)
+#define SIOCKCMCLONE	(SIOCPROTOPRIVATE + 2)
+
+#define KCMPROTO_CONNECTED	0
+
+/* Socket options */
+#define KCM_RECV_DISABLE	1
+
+#endif
+
diff --git a/net/Kconfig b/net/Kconfig
index 2760825e53fa..10640d5f8bee 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -360,6 +360,7 @@ source "net/can/Kconfig"
 source "net/irda/Kconfig"
 source "net/bluetooth/Kconfig"
 source "net/rxrpc/Kconfig"
+source "net/kcm/Kconfig"
 
 config FIB_RULES
 	bool
diff --git a/net/Makefile b/net/Makefile
index a5d04098dfce..81d14119eab5 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA)		+= irda/
 obj-$(CONFIG_BT)		+= bluetooth/
 obj-$(CONFIG_SUNRPC)		+= sunrpc/
 obj-$(CONFIG_AF_RXRPC)		+= rxrpc/
+obj-$(CONFIG_AF_KCM)		+= kcm/
 obj-$(CONFIG_ATM)		+= atm/
 obj-$(CONFIG_L2TP)		+= l2tp/
 obj-$(CONFIG_DECNET)		+= decnet/
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
new file mode 100644
index 000000000000..5db94d940ecc
--- /dev/null
+++ b/net/kcm/Kconfig
@@ -0,0 +1,10 @@
+
+config AF_KCM
+	tristate "KCM sockets"
+	depends on INET
+	select BPF_SYSCALL
+	---help---
+	  KCM (Kernel Connection Multiplexor) sockets provide a method
+	  for multiplexing messages of a message based application
+	  protocol over kernel connectons (e.g. TCP connections).
+
diff --git a/net/kcm/Makefile b/net/kcm/Makefile
new file mode 100644
index 000000000000..cb525f7c5a13
--- /dev/null
+++ b/net/kcm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_AF_KCM) += kcm.o
+
+kcm-y := kcmsock.o
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
new file mode 100644
index 000000000000..30ef69ac6b81
--- /dev/null
+++ b/net/kcm/kcmsock.c
@@ -0,0 +1,2016 @@
+#include <linux/bpf.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/poll.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/uaccess.h>
+#include <linux/workqueue.h>
+#include <net/kcm.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <uapi/linux/kcm.h>
+
+unsigned int kcm_net_id;
+
+static struct kmem_cache *kcm_psockp __read_mostly;
+static struct kmem_cache *kcm_muxp __read_mostly;
+static struct workqueue_struct *kcm_wq;
+
+static inline struct kcm_sock *kcm_sk(const struct sock *sk)
+{
+	return (struct kcm_sock *)sk;
+}
+
+static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
+{
+	return (struct kcm_tx_msg *)skb->cb;
+}
+
+static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb)
+{
+	return (struct kcm_rx_msg *)((void *)skb->cb +
+				     offsetof(struct qdisc_skb_cb, data));
+}
+
+static void report_csk_error(struct sock *csk, int err)
+{
+	csk->sk_err = EPIPE;
+	csk->sk_error_report(csk);
+}
+
+/* Callback lock held */
+static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
+			       struct sk_buff *skb)
+{
+	struct sock *csk = psock->sk;
+
+	/* Unrecoverable error in receive */
+
+	if (psock->rx_stopped)
+		return;
+
+	psock->rx_stopped = 1;
+
+	/* Report an error on the lower socket */
+	report_csk_error(csk, err);
+}
+
+static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
+			       bool wakeup_kcm)
+{
+	struct sock *csk = psock->sk;
+	struct kcm_mux *mux = psock->mux;
+
+	/* Unrecoverable error in transmit */
+
+	spin_lock_bh(&mux->lock);
+
+	if (psock->tx_stopped) {
+		spin_unlock_bh(&mux->lock);
+		return;
+	}
+
+	psock->tx_stopped = 1;
+
+	if (!psock->tx_kcm) {
+		/* Take off psocks_avail list */
+		list_del(&psock->psock_avail_list);
+	} else if (wakeup_kcm) {
+		/* In this case psock is being aborted while outside of
+		 * write_msgs and psock is reserved. Schedule tx_work
+		 * to handle the failure there. Need to commit tx_stopped
+		 * before queuing work.
+		 */
+		smp_mb();
+
+		queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+	/* Report error on lower socket */
+	report_csk_error(csk, err);
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+/* KCM is ready to receive messages on its queue-- either the KCM is new or
+ * has become unblocked after being blocked on full socket buffer. Queue any
+ * pending ready messages on a psock. RX mux lock held.
+ */
+static void kcm_rcv_ready(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock;
+	struct sk_buff *skb;
+
+	if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
+		return;
+
+	while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
+		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+			/* Assuming buffer limit has been reached */
+			skb_queue_head(&mux->rx_hold_queue, skb);
+			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+			return;
+		}
+	}
+
+	while (!list_empty(&mux->psocks_ready)) {
+		psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
+					 psock_ready_list);
+
+		if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
+			/* Assuming buffer limit has been reached */
+			WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+			return;
+		}
+
+		/* Consumed the ready message on the psock. Schedule rx_work to
+		 * get more messages.
+		 */
+		list_del(&psock->psock_ready_list);
+		psock->ready_rx_msg = NULL;
+
+		/* Commit clearing of ready_rx_msg for queuing work */
+		smp_mb();
+
+		queue_work(kcm_wq, &psock->rx_work);
+	}
+
+	/* Buffer limit is okay now, add to ready list */
+	list_add_tail(&kcm->wait_rx_list,
+		      &kcm->mux->kcm_rx_waiters);
+	kcm->rx_wait = true;
+}
+
+static void kcm_rfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	struct kcm_mux *mux = kcm->mux;
+	unsigned int len = skb->truesize;
+
+	sk_mem_uncharge(sk, len);
+	atomic_sub(len, &sk->sk_rmem_alloc);
+
+	/* For reading rx_wait and rx_psock without holding lock */
+	smp_mb__after_atomic();
+
+	if (!kcm->rx_wait && !kcm->rx_psock &&
+	    sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
+		spin_lock_bh(&mux->rx_lock);
+		kcm_rcv_ready(kcm);
+		spin_unlock_bh(&mux->rx_lock);
+	}
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct sk_buff_head *list = &sk->sk_receive_queue;
+
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		return -ENOMEM;
+
+	if (!sk_rmem_schedule(sk, skb, skb->truesize))
+		return -ENOBUFS;
+
+	skb->dev = NULL;
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = kcm_rfree;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+	sk_mem_charge(sk, skb->truesize);
+
+	skb_queue_tail(list, skb);
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk);
+
+	return 0;
+}
+
+/* Requeue received messages for a kcm socket to other kcm sockets. This is
+ * called with a kcm socket is receive disabled.
+ * RX mux lock held.
+ */
+static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
+{
+	struct sk_buff *skb;
+	struct kcm_sock *kcm;
+
+	while ((skb = __skb_dequeue(head))) {
+		/* Reset destructor to avoid calling kcm_rcv_ready */
+		skb->destructor = sock_rfree;
+		skb_orphan(skb);
+try_again:
+		if (list_empty(&mux->kcm_rx_waiters)) {
+			skb_queue_tail(&mux->rx_hold_queue, skb);
+			continue;
+		}
+
+		kcm = list_first_entry(&mux->kcm_rx_waiters,
+				       struct kcm_sock, wait_rx_list);
+
+		if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+			/* Should mean socket buffer full */
+			list_del(&kcm->wait_rx_list);
+			kcm->rx_wait = false;
+
+			/* Commit rx_wait to read in kcm_free */
+			smp_wmb();
+
+			goto try_again;
+		}
+	}
+}
+
+/* Lower sock lock held */
+static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
+				       struct sk_buff *head)
+{
+	struct kcm_mux *mux = psock->mux;
+	struct kcm_sock *kcm;
+
+	WARN_ON(psock->ready_rx_msg);
+
+	if (psock->rx_kcm)
+		return psock->rx_kcm;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	if (psock->rx_kcm) {
+		spin_unlock_bh(&mux->rx_lock);
+		return psock->rx_kcm;
+	}
+
+	if (list_empty(&mux->kcm_rx_waiters)) {
+		psock->ready_rx_msg = head;
+		list_add_tail(&psock->psock_ready_list,
+			      &mux->psocks_ready);
+		spin_unlock_bh(&mux->rx_lock);
+		return NULL;
+	}
+
+	kcm = list_first_entry(&mux->kcm_rx_waiters,
+			       struct kcm_sock, wait_rx_list);
+	list_del(&kcm->wait_rx_list);
+	kcm->rx_wait = false;
+
+	psock->rx_kcm = kcm;
+	kcm->rx_psock = psock;
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	return kcm;
+}
+
+static void kcm_done(struct kcm_sock *kcm);
+
+static void kcm_done_work(struct work_struct *w)
+{
+	kcm_done(container_of(w, struct kcm_sock, done_work));
+}
+
+/* Lower sock held */
+static void unreserve_rx_kcm(struct kcm_psock *psock,
+			     bool rcv_ready)
+{
+	struct kcm_sock *kcm = psock->rx_kcm;
+	struct kcm_mux *mux = psock->mux;
+
+	if (!kcm)
+		return;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	psock->rx_kcm = NULL;
+	kcm->rx_psock = NULL;
+
+	/* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
+	 * kcm_rfree
+	 */
+	smp_mb();
+
+	if (unlikely(kcm->done)) {
+		spin_unlock_bh(&mux->rx_lock);
+
+		/* Need to run kcm_done in a task since we need to qcquire
+		 * callback locks which may already be held here.
+		 */
+		INIT_WORK(&kcm->done_work, kcm_done_work);
+		schedule_work(&kcm->done_work);
+		return;
+	}
+
+	if (unlikely(kcm->rx_disabled)) {
+		requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+	} else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
+		/* Check for degenerative race with rx_wait that all
+		 * data was dequeued (accounted for in kcm_rfree).
+		 */
+		kcm_rcv_ready(kcm);
+	}
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* Macro to invoke filter function. */
+#define KCM_RUN_FILTER(prog, ctx) \
+	(*prog->bpf_func)(ctx, prog->insnsi)
+
+/* Lower socket lock held */
+static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
+			unsigned int orig_offset, size_t orig_len)
+{
+	struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
+	struct kcm_rx_msg *rxm;
+	struct kcm_sock *kcm;
+	struct sk_buff *head, *skb;
+	size_t eaten = 0, cand_len;
+	ssize_t extra;
+	int err;
+	bool cloned_orig = false;
+
+	if (psock->ready_rx_msg)
+		return 0;
+
+	head = psock->rx_skb_head;
+	if (head) {
+		/* Message already in progress */
+
+		if (unlikely(orig_offset)) {
+			/* Getting data with a non-zero offset when a message is
+			 * in progress is not expected. If it does happen, we
+			 * need to clone and pull since we can't deal with
+			 * offsets in the skbs for a message expect in the head.
+			 */
+			orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
+			if (!orig_skb) {
+				desc->error = -ENOMEM;
+				return 0;
+			}
+			if (!pskb_pull(orig_skb, orig_offset)) {
+				kfree_skb(orig_skb);
+				desc->error = -ENOMEM;
+				return 0;
+			}
+			cloned_orig = true;
+			orig_offset = 0;
+		}
+
+		if (!psock->rx_skb_nextp) {
+			/* We are going to append to the frags_list of head.
+			 * Need to unshare the frag_list.
+			 */
+			err = skb_unclone(head, GFP_ATOMIC);
+			if (err) {
+				desc->error = err;
+				return 0;
+			}
+
+			if (unlikely(skb_shinfo(head)->frag_list)) {
+				/* We can't append to an sk_buff that already
+				 * has a frag_list. We create a new head, point
+				 * the frag_list of that to the old head, and
+				 * then are able to use the old head->next for
+				 * appending to the message.
+				 */
+				if (WARN_ON(head->next)) {
+					desc->error = -EINVAL;
+					return 0;
+				}
+
+				skb = alloc_skb(0, GFP_ATOMIC);
+				if (!skb) {
+					desc->error = -ENOMEM;
+					return 0;
+				}
+				skb->len = head->len;
+				skb->data_len = head->len;
+				skb->truesize = head->truesize;
+				*kcm_rx_msg(skb) = *kcm_rx_msg(head);
+				psock->rx_skb_nextp = &head->next;
+				skb_shinfo(skb)->frag_list = head;
+				psock->rx_skb_head = skb;
+				head = skb;
+			} else {
+				psock->rx_skb_nextp =
+				    &skb_shinfo(head)->frag_list;
+			}
+		}
+	}
+
+	while (eaten < orig_len) {
+		/* Always clone since we will consume something */
+		skb = skb_clone(orig_skb, GFP_ATOMIC);
+		if (!skb) {
+			desc->error = -ENOMEM;
+			break;
+		}
+
+		cand_len = orig_len - eaten;
+
+		head = psock->rx_skb_head;
+		if (!head) {
+			head = skb;
+			psock->rx_skb_head = head;
+			/* Will set rx_skb_nextp on next packet if needed */
+			psock->rx_skb_nextp = NULL;
+			rxm = kcm_rx_msg(head);
+			memset(rxm, 0, sizeof(*rxm));
+			rxm->offset = orig_offset + eaten;
+		} else {
+			/* Unclone since we may be appending to an skb that we
+			 * already share a frag_list with.
+			 */
+			err = skb_unclone(skb, GFP_ATOMIC);
+			if (err) {
+				desc->error = err;
+				break;
+			}
+
+			rxm = kcm_rx_msg(head);
+			*psock->rx_skb_nextp = skb;
+			psock->rx_skb_nextp = &skb->next;
+			head->data_len += skb->len;
+			head->len += skb->len;
+			head->truesize += skb->truesize;
+		}
+
+		if (!rxm->full_len) {
+			ssize_t len;
+
+			len = KCM_RUN_FILTER(psock->bpf_prog, head);
+
+			if (!len) {
+				/* Need more header to determine length */
+				rxm->accum_len += cand_len;
+				eaten += cand_len;
+				WARN_ON(eaten != orig_len);
+				break;
+			} else if (len <= (ssize_t)head->len -
+					  skb->len - rxm->offset) {
+				/* Length must be into new skb (and also
+				 * greater than zero)
+				 */
+				desc->error = -EPROTO;
+				psock->rx_skb_head = NULL;
+				kcm_abort_rx_psock(psock, EPROTO, head);
+				break;
+			}
+
+			rxm->full_len = len;
+		}
+
+		extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;
+
+		if (extra < 0) {
+			/* Message not complete yet. */
+			rxm->accum_len += cand_len;
+			eaten += cand_len;
+			WARN_ON(eaten != orig_len);
+			break;
+		}
+
+		/* Positive extra indicates ore bytes than needed for the
+		 * message
+		 */
+
+		WARN_ON(extra > cand_len);
+
+		eaten += (cand_len - extra);
+
+		/* Hurray, we have a new message! */
+		psock->rx_skb_head = NULL;
+
+try_queue:
+		kcm = reserve_rx_kcm(psock, head);
+		if (!kcm) {
+			/* Unable to reserve a KCM, message is held in psock. */
+			break;
+		}
+
+		if (kcm_queue_rcv_skb(&kcm->sk, head)) {
+			/* Should mean socket buffer full */
+			unreserve_rx_kcm(psock, false);
+			goto try_queue;
+		}
+	}
+
+	if (cloned_orig)
+		kfree_skb(orig_skb);
+
+	return eaten;
+}
+
+/* Called with lock held on lower socket */
+static int psock_tcp_read_sock(struct kcm_psock *psock)
+{
+	read_descriptor_t desc;
+
+	desc.arg.data = psock;
+	desc.error = 0;
+	desc.count = 1; /* give more than one skb per call */
+
+	/* sk should be locked here, so okay to do tcp_read_sock */
+	tcp_read_sock(psock->sk, &desc, kcm_tcp_recv);
+
+	unreserve_rx_kcm(psock, true);
+
+	return desc.error;
+}
+
+/* Lower sock lock held */
+static void psock_tcp_data_ready(struct sock *sk)
+{
+	struct kcm_psock *psock;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	psock = (struct kcm_psock *)sk->sk_user_data;
+	if (unlikely(!psock || psock->rx_stopped))
+		goto out;
+
+	if (psock->ready_rx_msg)
+		goto out;
+
+	if (psock_tcp_read_sock(psock) == -ENOMEM)
+		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void do_psock_rx_work(struct kcm_psock *psock)
+{
+	read_descriptor_t rd_desc;
+	struct sock *csk = psock->sk;
+
+	/* We need the read lock to synchronize with psock_tcp_data_ready. We
+	 * need the socket lock for calling tcp_read_sock.
+	 */
+	lock_sock(csk);
+	read_lock_bh(&csk->sk_callback_lock);
+
+	if (unlikely(csk->sk_user_data != psock))
+		goto out;
+
+	if (unlikely(psock->rx_stopped))
+		goto out;
+
+	if (psock->ready_rx_msg)
+		goto out;
+
+	rd_desc.arg.data = psock;
+
+	if (psock_tcp_read_sock(psock) == -ENOMEM)
+		queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+	read_unlock_bh(&csk->sk_callback_lock);
+	release_sock(csk);
+}
+
+static void psock_rx_work(struct work_struct *w)
+{
+	do_psock_rx_work(container_of(w, struct kcm_psock, rx_work));
+}
+
+static void psock_rx_delayed_work(struct work_struct *w)
+{
+	do_psock_rx_work(container_of(w, struct kcm_psock,
+				      rx_delayed_work.work));
+}
+
+static void psock_tcp_state_change(struct sock *sk)
+{
+	/* TCP only does a POLLIN for a half close. Do a POLLHUP here
+	 * since application will normally not poll with POLLIN
+	 * on the TCP sockets.
+	 */
+
+	report_csk_error(sk, EPIPE);
+}
+
+static void psock_tcp_write_space(struct sock *sk)
+{
+	struct kcm_psock *psock;
+	struct kcm_mux *mux;
+	struct kcm_sock *kcm;
+
+	read_lock_bh(&sk->sk_callback_lock);
+
+	psock = (struct kcm_psock *)sk->sk_user_data;
+	if (unlikely(!psock))
+		goto out;
+
+	mux = psock->mux;
+
+	spin_lock_bh(&mux->lock);
+
+	/* Check if the socket is reserved so someone is waiting for sending. */
+	kcm = psock->tx_kcm;
+	if (kcm)
+		queue_work(kcm_wq, &kcm->tx_work);
+
+	spin_unlock_bh(&mux->lock);
+out:
+	read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void unreserve_psock(struct kcm_sock *kcm);
+
+/* kcm sock is locked. */
+static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock;
+
+	psock = kcm->tx_psock;
+
+	smp_rmb(); /* Must read tx_psock before tx_wait */
+
+	if (psock) {
+		WARN_ON(kcm->tx_wait);
+		if (unlikely(psock->tx_stopped))
+			unreserve_psock(kcm);
+		else
+			return kcm->tx_psock;
+	}
+
+	spin_lock_bh(&mux->lock);
+
+	/* Check again under lock to see if psock was reserved for this
+	 * psock via psock_unreserve.
+	 */
+	psock = kcm->tx_psock;
+	if (unlikely(psock)) {
+		WARN_ON(kcm->tx_wait);
+		spin_unlock_bh(&mux->lock);
+		return kcm->tx_psock;
+	}
+
+	if (!list_empty(&mux->psocks_avail)) {
+		psock = list_first_entry(&mux->psocks_avail,
+					 struct kcm_psock,
+					 psock_avail_list);
+		list_del(&psock->psock_avail_list);
+		if (kcm->tx_wait) {
+			list_del(&kcm->wait_psock_list);
+			kcm->tx_wait = false;
+		}
+		kcm->tx_psock = psock;
+		psock->tx_kcm = kcm;
+	} else if (!kcm->tx_wait) {
+		list_add_tail(&kcm->wait_psock_list,
+			      &mux->kcm_tx_waiters);
+		kcm->tx_wait = true;
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+	return psock;
+}
+
+/* mux lock held */
+static void psock_now_avail(struct kcm_psock *psock)
+{
+	struct kcm_mux *mux = psock->mux;
+	struct kcm_sock *kcm;
+
+	if (list_empty(&mux->kcm_tx_waiters)) {
+		list_add_tail(&psock->psock_avail_list,
+			      &mux->psocks_avail);
+	} else {
+		kcm = list_first_entry(&mux->kcm_tx_waiters,
+				       struct kcm_sock,
+				       wait_psock_list);
+		list_del(&kcm->wait_psock_list);
+		kcm->tx_wait = false;
+		psock->tx_kcm = kcm;
+
+		/* Commit before changing tx_psock since that is read in
+		 * reserve_psock before queuing work.
+		 */
+		smp_mb();
+
+		kcm->tx_psock = psock;
+		queue_work(kcm_wq, &kcm->tx_work);
+	}
+}
+
+/* kcm sock is locked. */
+static void unreserve_psock(struct kcm_sock *kcm)
+{
+	struct kcm_psock *psock;
+	struct kcm_mux *mux = kcm->mux;
+
+	spin_lock_bh(&mux->lock);
+
+	psock = kcm->tx_psock;
+
+	if (WARN_ON(!psock)) {
+		spin_unlock_bh(&mux->lock);
+		return;
+	}
+
+	smp_rmb(); /* Read tx_psock before tx_wait */
+
+	WARN_ON(kcm->tx_wait);
+
+	kcm->tx_psock = NULL;
+	psock->tx_kcm = NULL;
+
+	if (unlikely(psock->tx_stopped)) {
+		if (psock->done) {
+			/* Deferred free */
+			list_del(&psock->psock_list);
+			mux->psocks_cnt--;
+			sock_put(psock->sk);
+			fput(psock->sk->sk_socket->file);
+			kmem_cache_free(kcm_psockp, psock);
+		}
+
+		/* Don't put back on available list */
+
+		spin_unlock_bh(&mux->lock);
+
+		return;
+	}
+
+	psock_now_avail(psock);
+
+	spin_unlock_bh(&mux->lock);
+}
+
+/* Write any messages ready on the kcm socket.  Called with kcm sock lock
+ * held.  Return bytes actually sent or error.
+ */
+static int kcm_write_msgs(struct kcm_sock *kcm)
+{
+	struct sock *sk = &kcm->sk;
+	struct kcm_psock *psock;
+	struct sk_buff *skb, *head;
+	struct kcm_tx_msg *txm;
+	unsigned short fragidx, frag_offset;
+	unsigned int sent, total_sent = 0;
+	int ret = 0;
+
+	kcm->tx_wait_more = false;
+	psock = kcm->tx_psock;
+	if (unlikely(psock && psock->tx_stopped)) {
+		/* A reserved psock was aborted asynchronously. Unreserve
+		 * it and we'll retry the message.
+		 */
+		unreserve_psock(kcm);
+		if (skb_queue_empty(&sk->sk_write_queue))
+			return 0;
+
+		kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
+
+	} else if (skb_queue_empty(&sk->sk_write_queue)) {
+		return 0;
+	}
+
+	head = skb_peek(&sk->sk_write_queue);
+	txm = kcm_tx_msg(head);
+
+	if (txm->sent) {
+		/* Send of first skbuff in queue already in progress */
+		if (WARN_ON(!psock)) {
+			ret = -EINVAL;
+			goto out;
+		}
+		sent = txm->sent;
+		frag_offset = txm->frag_offset;
+		fragidx = txm->fragidx;
+		skb = txm->frag_skb;
+
+		goto do_frag;
+	}
+
+try_again:
+	psock = reserve_psock(kcm);
+	if (!psock)
+		goto out;
+
+	do {
+		skb = head;
+		txm = kcm_tx_msg(head);
+		sent = 0;
+
+do_frag_list:
+		if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
+		     fragidx++) {
+			skb_frag_t *frag;
+
+			frag_offset = 0;
+do_frag:
+			frag = &skb_shinfo(skb)->frags[fragidx];
+			if (WARN_ON(!frag->size)) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			ret = kernel_sendpage(psock->sk->sk_socket,
+					      frag->page.p,
+					      frag->page_offset + frag_offset,
+					      frag->size - frag_offset,
+					      MSG_DONTWAIT);
+			if (ret <= 0) {
+				if (ret == -EAGAIN) {
+					/* Save state to try again when there's
+					 * write space on the socket
+					 */
+					txm->sent = sent;
+					txm->frag_offset = frag_offset;
+					txm->fragidx = fragidx;
+					txm->frag_skb = skb;
+
+					ret = 0;
+					goto out;
+				}
+
+				/* Hard failure in sending message, abort this
+				 * psock since it has lost framing
+				 * synchonization and retry sending the
+				 * message from the beginning.
+				 */
+				kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
+						   true);
+				unreserve_psock(kcm);
+
+				txm->sent = 0;
+				ret = 0;
+
+				goto try_again;
+			}
+
+			sent += ret;
+			frag_offset += ret;
+			if (frag_offset < frag->size) {
+				/* Not finished with this frag */
+				goto do_frag;
+			}
+		}
+
+		if (skb == head) {
+			if (skb_has_frag_list(skb)) {
+				skb = skb_shinfo(skb)->frag_list;
+				goto do_frag_list;
+			}
+		} else if (skb->next) {
+			skb = skb->next;
+			goto do_frag_list;
+		}
+
+		/* Successfully sent the whole packet, account for it. */
+		skb_dequeue(&sk->sk_write_queue);
+		kfree_skb(head);
+		sk->sk_wmem_queued -= sent;
+		total_sent += sent;
+	} while ((head = skb_peek(&sk->sk_write_queue)));
+out:
+	if (!head) {
+		/* Done with all queued messages. */
+		WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+		unreserve_psock(kcm);
+	}
+
+	/* Check if write space is available */
+	sk->sk_write_space(sk);
+
+	return total_sent ? : ret;
+}
+
+static void kcm_tx_work(struct work_struct *w)
+{
+	struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
+	struct sock *sk = &kcm->sk;
+	int err;
+
+	lock_sock(sk);
+
+	/* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
+	 * aborts
+	 */
+	err = kcm_write_msgs(kcm);
+	if (err < 0) {
+		/* Hard failure in write, report error on KCM socket */
+		pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err);
+		report_csk_error(&kcm->sk, -err);
+		goto out;
+	}
+
+	/* Primarily for SOCK_SEQPACKET sockets */
+	if (likely(sk->sk_socket) &&
+	    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		sk->sk_write_space(sk);
+	}
+
+out:
+	release_sock(sk);
+}
+
+static void kcm_push(struct kcm_sock *kcm)
+{
+	if (kcm->tx_wait_more)
+		kcm_write_msgs(kcm);
+}
+
+static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm = kcm_sk(sk);
+	struct sk_buff *skb = NULL, *head = NULL;
+	size_t copy, copied = 0;
+	long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+	int eor = (sock->type == SOCK_DGRAM) ?
+		  !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
+	int err = -EPIPE;
+
+	lock_sock(sk);
+
+	/* Per tcp_sendmsg this should be in poll */
+	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+	if (sk->sk_err)
+		goto out_error;
+
+	if (kcm->seq_skb) {
+		/* Previously opened message */
+		head = kcm->seq_skb;
+		skb = kcm_tx_msg(head)->last_skb;
+		goto start;
+	}
+
+	/* Call the sk_stream functions to manage the sndbuf mem. */
+	if (!sk_stream_memory_free(sk)) {
+		kcm_push(kcm);
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+	}
+
+	/* New message, alloc head skb */
+	head = alloc_skb(0, sk->sk_allocation);
+	while (!head) {
+		kcm_push(kcm);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+
+		head = alloc_skb(0, sk->sk_allocation);
+	}
+
+	skb = head;
+
+	/* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
+	 * csum_and_copy_from_iter from skb_do_copy_data_nocache.
+	 */
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+start:
+	while (msg_data_left(msg)) {
+		bool merge = true;
+		int i = skb_shinfo(skb)->nr_frags;
+		struct page_frag *pfrag = sk_page_frag(sk);
+
+		if (!sk_page_frag_refill(sk, pfrag))
+			goto wait_for_memory;
+
+		if (!skb_can_coalesce(skb, i, pfrag->page,
+				      pfrag->offset)) {
+			if (i == MAX_SKB_FRAGS) {
+				struct sk_buff *tskb;
+
+				tskb = alloc_skb(0, sk->sk_allocation);
+				if (!tskb)
+					goto wait_for_memory;
+
+				if (head == skb)
+					skb_shinfo(head)->frag_list = tskb;
+				else
+					skb->next = tskb;
+
+				skb = tskb;
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+				continue;
+			}
+			merge = false;
+		}
+
+		copy = min_t(int, msg_data_left(msg),
+			     pfrag->size - pfrag->offset);
+
+		if (!sk_wmem_schedule(sk, copy))
+			goto wait_for_memory;
+
+		err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+					       pfrag->page,
+					       pfrag->offset,
+					       copy);
+		if (err)
+			goto out_error;
+
+		/* Update the skb. */
+		if (merge) {
+			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+		} else {
+			skb_fill_page_desc(skb, i, pfrag->page,
+					   pfrag->offset, copy);
+			get_page(pfrag->page);
+		}
+
+		pfrag->offset += copy;
+		copied += copy;
+		if (head != skb) {
+			head->len += copy;
+			head->data_len += copy;
+		}
+
+		continue;
+
+wait_for_memory:
+		kcm_push(kcm);
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_error;
+	}
+
+	if (eor) {
+		bool not_busy = skb_queue_empty(&sk->sk_write_queue);
+
+		/* Message complete, queue it on send buffer */
+		__skb_queue_tail(&sk->sk_write_queue, head);
+		kcm->seq_skb = NULL;
+
+		if (msg->msg_flags & MSG_BATCH) {
+			kcm->tx_wait_more = true;
+		} else if (kcm->tx_wait_more || not_busy) {
+			err = kcm_write_msgs(kcm);
+			if (err < 0) {
+				/* We got a hard error in write_msgs but have
+				 * already queued this message. Report an error
+				 * in the socket, but don't affect return value
+				 * from sendmsg
+				 */
+				pr_warn("KCM: Hard failure on kcm_write_msgs\n");
+				report_csk_error(&kcm->sk, -err);
+			}
+		}
+	} else {
+		/* Message not complete, save state */
+partial_message:
+		kcm->seq_skb = head;
+		kcm_tx_msg(head)->last_skb = skb;
+	}
+
+	release_sock(sk);
+	return copied;
+
+out_error:
+	kcm_push(kcm);
+
+	if (copied && sock->type == SOCK_SEQPACKET) {
+		/* Wrote some bytes before encountering an
+		 * error, return partial success.
+		 */
+		goto partial_message;
+	}
+
+	if (head != kcm->seq_skb)
+		kfree_skb(head);
+
+	err = sk_stream_error(sk, msg->msg_flags, err);
+
+	/* make sure we wake any epoll edge trigger waiter */
+	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+		sk->sk_write_space(sk);
+
+	release_sock(sk);
+	return err;
+}
+
+static struct sk_buff *kcm_wait_data(struct sock *sk, int flags,
+				     long timeo, int *err)
+{
+	struct sk_buff *skb;
+
+	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
+		if (sk->sk_err) {
+			*err = sock_error(sk);
+			return NULL;
+		}
+
+		if (sock_flag(sk, SOCK_DONE))
+			return NULL;
+
+		if ((flags & MSG_DONTWAIT) || !timeo) {
+			*err = -EAGAIN;
+			return NULL;
+		}
+
+		sk_wait_data(sk, &timeo, NULL);
+
+		/* Handle signals */
+		if (signal_pending(current)) {
+			*err = sock_intr_errno(timeo);
+			return NULL;
+		}
+	}
+
+	return skb;
+}
+
+static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
+		       size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+	long timeo;
+	struct kcm_rx_msg *rxm;
+	int copied = 0;
+	struct sk_buff *skb;
+
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	lock_sock(sk);
+
+	skb = kcm_wait_data(sk, flags, timeo, &err);
+	if (!skb)
+		goto out;
+
+	/* Okay, have a message on the receive queue */
+
+	rxm = kcm_rx_msg(skb);
+
+	if (len > rxm->full_len)
+		len = rxm->full_len;
+
+	err = skb_copy_datagram_msg(skb, rxm->offset, msg, len);
+	if (err < 0)
+		goto out;
+
+	copied = len;
+	if (likely(!(flags & MSG_PEEK))) {
+		if (copied < rxm->full_len) {
+			if (sock->type == SOCK_DGRAM) {
+				/* Truncated message */
+				msg->msg_flags |= MSG_TRUNC;
+				goto msg_finished;
+			}
+			rxm->offset += copied;
+			rxm->full_len -= copied;
+		} else {
+msg_finished:
+			/* Finished with message */
+			msg->msg_flags |= MSG_EOR;
+			skb_unlink(skb, &sk->sk_receive_queue);
+			kfree_skb(skb);
+		}
+	}
+
+out:
+	release_sock(sk);
+
+	return copied ? : err;
+}
+
+/* kcm sock lock held */
+static void kcm_recv_disable(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+
+	if (kcm->rx_disabled)
+		return;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	kcm->rx_disabled = 1;
+
+	/* If a psock is reserved we'll do cleanup in unreserve */
+	if (!kcm->rx_psock) {
+		if (kcm->rx_wait) {
+			list_del(&kcm->wait_rx_list);
+			kcm->rx_wait = false;
+		}
+
+		requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+	}
+
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+/* kcm sock lock held */
+static void kcm_recv_enable(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+
+	if (!kcm->rx_disabled)
+		return;
+
+	spin_lock_bh(&mux->rx_lock);
+
+	kcm->rx_disabled = 0;
+	kcm_rcv_ready(kcm);
+
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+static int kcm_setsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, unsigned int optlen)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	int val, valbool;
+	int err = 0;
+
+	if (level != SOL_KCM)
+		return -ENOPROTOOPT;
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EINVAL;
+
+	valbool = val ? 1 : 0;
+
+	switch (optname) {
+	case KCM_RECV_DISABLE:
+		lock_sock(&kcm->sk);
+		if (valbool)
+			kcm_recv_disable(kcm);
+		else
+			kcm_recv_enable(kcm);
+		release_sock(&kcm->sk);
+		break;
+	default:
+		err = -ENOPROTOOPT;
+	}
+
+	return err;
+}
+
+static int kcm_getsockopt(struct socket *sock, int level, int optname,
+			  char __user *optval, int __user *optlen)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	int val, len;
+
+	if (level != SOL_KCM)
+		return -ENOPROTOOPT;
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	len = min_t(unsigned int, len, sizeof(int));
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case KCM_RECV_DISABLE:
+		val = kcm->rx_disabled;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
+static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
+{
+	struct kcm_sock *tkcm;
+	struct list_head *head;
+	int index = 0;
+
+	/* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
+	 * we set sk_state, otherwise epoll_wait always returns right away with
+	 * POLLHUP
+	 */
+	kcm->sk.sk_state = TCP_ESTABLISHED;
+
+	/* Add to mux's kcm sockets list */
+	kcm->mux = mux;
+	spin_lock_bh(&mux->lock);
+
+	head = &mux->kcm_socks;
+	list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) {
+		if (tkcm->index != index)
+			break;
+		head = &tkcm->kcm_sock_list;
+		index++;
+	}
+
+	list_add(&kcm->kcm_sock_list, head);
+	kcm->index = index;
+
+	mux->kcm_socks_cnt++;
+	spin_unlock_bh(&mux->lock);
+
+	INIT_WORK(&kcm->tx_work, kcm_tx_work);
+
+	spin_lock_bh(&mux->rx_lock);
+	kcm_rcv_ready(kcm);
+	spin_unlock_bh(&mux->rx_lock);
+}
+
+static int kcm_attach(struct socket *sock, struct socket *csock,
+		      struct bpf_prog *prog)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	struct kcm_mux *mux = kcm->mux;
+	struct sock *csk;
+	struct kcm_psock *psock = NULL, *tpsock;
+	struct list_head *head;
+	int index = 0;
+
+	if (csock->ops->family != PF_INET &&
+	    csock->ops->family != PF_INET6)
+		return -EINVAL;
+
+	csk = csock->sk;
+	if (!csk)
+		return -EINVAL;
+
+	/* Only support TCP for now */
+	if (csk->sk_protocol != IPPROTO_TCP)
+		return -EINVAL;
+
+	psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
+	if (!psock)
+		return -ENOMEM;
+
+	psock->mux = mux;
+	psock->sk = csk;
+	psock->bpf_prog = prog;
+	INIT_WORK(&psock->rx_work, psock_rx_work);
+	INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work);
+
+	sock_hold(csk);
+
+	write_lock_bh(&csk->sk_callback_lock);
+	psock->save_data_ready = csk->sk_data_ready;
+	psock->save_write_space = csk->sk_write_space;
+	psock->save_state_change = csk->sk_state_change;
+	csk->sk_user_data = psock;
+	csk->sk_data_ready = psock_tcp_data_ready;
+	csk->sk_write_space = psock_tcp_write_space;
+	csk->sk_state_change = psock_tcp_state_change;
+	write_unlock_bh(&csk->sk_callback_lock);
+
+	/* Finished initialization, now add the psock to the MUX. */
+	spin_lock_bh(&mux->lock);
+	head = &mux->psocks;
+	list_for_each_entry(tpsock, &mux->psocks, psock_list) {
+		if (tpsock->index != index)
+			break;
+		head = &tpsock->psock_list;
+		index++;
+	}
+
+	list_add(&psock->psock_list, head);
+	psock->index = index;
+
+	mux->psocks_cnt++;
+	psock_now_avail(psock);
+	spin_unlock_bh(&mux->lock);
+
+	/* Schedule RX work in case there are already bytes queued */
+	queue_work(kcm_wq, &psock->rx_work);
+
+	return 0;
+}
+
+static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
+{
+	struct socket *csock;
+	struct bpf_prog *prog;
+	int err;
+
+	csock = sockfd_lookup(info->fd, &err);
+	if (!csock)
+		return -ENOENT;
+
+	prog = bpf_prog_get(info->bpf_fd);
+	if (IS_ERR(prog)) {
+		err = PTR_ERR(prog);
+		goto out;
+	}
+
+	if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
+		bpf_prog_put(prog);
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = kcm_attach(sock, csock, prog);
+	if (err) {
+		bpf_prog_put(prog);
+		goto out;
+	}
+
+	/* Keep reference on file also */
+
+	return 0;
+out:
+	fput(csock->file);
+	return err;
+}
+
+static void kcm_unattach(struct kcm_psock *psock)
+{
+	struct sock *csk = psock->sk;
+	struct kcm_mux *mux = psock->mux;
+
+	/* Stop getting callbacks from TCP socket. After this there should
+	 * be no way to reserve a kcm for this psock.
+	 */
+	write_lock_bh(&csk->sk_callback_lock);
+	csk->sk_user_data = NULL;
+	csk->sk_data_ready = psock->save_data_ready;
+	csk->sk_write_space = psock->save_write_space;
+	csk->sk_state_change = psock->save_state_change;
+	psock->rx_stopped = 1;
+
+	if (WARN_ON(psock->rx_kcm)) {
+		write_unlock_bh(&csk->sk_callback_lock);
+		return;
+	}
+
+	spin_lock_bh(&mux->rx_lock);
+
+	/* Stop receiver activities. After this point psock should not be
+	 * able to get onto ready list either through callbacks or work.
+	 */
+	if (psock->ready_rx_msg) {
+		list_del(&psock->psock_ready_list);
+		kfree_skb(psock->ready_rx_msg);
+		psock->ready_rx_msg = NULL;
+	}
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	write_unlock_bh(&csk->sk_callback_lock);
+
+	cancel_work_sync(&psock->rx_work);
+	cancel_delayed_work_sync(&psock->rx_delayed_work);
+
+	bpf_prog_put(psock->bpf_prog);
+
+	kfree_skb(psock->rx_skb_head);
+	psock->rx_skb_head = NULL;
+
+	spin_lock_bh(&mux->lock);
+
+	if (psock->tx_kcm) {
+		/* psock was reserved.  Just mark it finished and we will clean
+		 * up in the kcm paths, we need kcm lock which can not be
+		 * acquired here.
+		 */
+		spin_unlock_bh(&mux->lock);
+
+		/* We are unattaching a socket that is reserved. Abort the
+		 * socket since we may be out of sync in sending on it. We need
+		 * to do this without the mux lock.
+		 */
+		kcm_abort_tx_psock(psock, EPIPE, false);
+
+		spin_lock_bh(&mux->lock);
+		if (!psock->tx_kcm) {
+			/* psock now unreserved in window mux was unlocked */
+			goto no_reserved;
+		}
+		psock->done = 1;
+
+		/* Commit done before queuing work to process it */
+		smp_mb();
+
+		/* Queue tx work to make sure psock->done is handled */
+		queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+		spin_unlock_bh(&mux->lock);
+	} else {
+no_reserved:
+		if (!psock->tx_stopped)
+			list_del(&psock->psock_avail_list);
+		list_del(&psock->psock_list);
+		mux->psocks_cnt--;
+		spin_unlock_bh(&mux->lock);
+
+		sock_put(csk);
+		fput(csk->sk_socket->file);
+		kmem_cache_free(kcm_psockp, psock);
+	}
+}
+
+static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
+{
+	struct kcm_sock *kcm = kcm_sk(sock->sk);
+	struct kcm_mux *mux = kcm->mux;
+	struct kcm_psock *psock;
+	struct socket *csock;
+	struct sock *csk;
+	int err;
+
+	csock = sockfd_lookup(info->fd, &err);
+	if (!csock)
+		return -ENOENT;
+
+	csk = csock->sk;
+	if (!csk) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = -ENOENT;
+
+	spin_lock_bh(&mux->lock);
+
+	list_for_each_entry(psock, &mux->psocks, psock_list) {
+		if (psock->sk != csk)
+			continue;
+
+		/* Found the matching psock */
+
+		if (psock->unattaching || WARN_ON(psock->done)) {
+			err = -EALREADY;
+			break;
+		}
+
+		psock->unattaching = 1;
+
+		spin_unlock_bh(&mux->lock);
+
+		kcm_unattach(psock);
+
+		err = 0;
+		goto out;
+	}
+
+	spin_unlock_bh(&mux->lock);
+
+out:
+	fput(csock->file);
+	return err;
+}
+
+static struct proto kcm_proto = {
+	.name	= "KCM",
+	.owner	= THIS_MODULE,
+	.obj_size = sizeof(struct kcm_sock),
+};
+
+/* Clone a kcm socket. */
+static int kcm_clone(struct socket *osock, struct kcm_clone *info,
+		     struct socket **newsockp)
+{
+	struct socket *newsock;
+	struct sock *newsk;
+	struct file *newfile;
+	int err, newfd;
+
+	err = -ENFILE;
+	newsock = sock_alloc();
+	if (!newsock)
+		goto out;
+
+	newsock->type = osock->type;
+	newsock->ops = osock->ops;
+
+	__module_get(newsock->ops->owner);
+
+	newfd = get_unused_fd_flags(0);
+	if (unlikely(newfd < 0)) {
+		err = newfd;
+		goto out_fd_fail;
+	}
+
+	newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
+	if (unlikely(IS_ERR(newfile))) {
+		err = PTR_ERR(newfile);
+		goto out_sock_alloc_fail;
+	}
+
+	newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
+			 &kcm_proto, true);
+	if (!newsk) {
+		err = -ENOMEM;
+		goto out_sk_alloc_fail;
+	}
+
+	sock_init_data(newsock, newsk);
+	init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
+
+	fd_install(newfd, newfile);
+	*newsockp = newsock;
+	info->fd = newfd;
+
+	return 0;
+
+out_sk_alloc_fail:
+	fput(newfile);
+out_sock_alloc_fail:
+	put_unused_fd(newfd);
+out_fd_fail:
+	sock_release(newsock);
+out:
+	return err;
+}
+
+static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	switch (cmd) {
+	case SIOCKCMATTACH: {
+		struct kcm_attach info;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			err = -EFAULT;
+
+		err = kcm_attach_ioctl(sock, &info);
+
+		break;
+	}
+	case SIOCKCMUNATTACH: {
+		struct kcm_unattach info;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			err = -EFAULT;
+
+		err = kcm_unattach_ioctl(sock, &info);
+
+		break;
+	}
+	case SIOCKCMCLONE: {
+		struct kcm_clone info;
+		struct socket *newsock = NULL;
+
+		if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+			err = -EFAULT;
+
+		err = kcm_clone(sock, &info, &newsock);
+
+		if (!err) {
+			if (copy_to_user((void __user *)arg, &info,
+					 sizeof(info))) {
+				err = -EFAULT;
+				sock_release(newsock);
+			}
+		}
+
+		break;
+	}
+	default:
+		err = -ENOIOCTLCMD;
+		break;
+	}
+
+	return err;
+}
+
+static void free_mux(struct rcu_head *rcu)
+{
+	struct kcm_mux *mux = container_of(rcu,
+	    struct kcm_mux, rcu);
+
+	kmem_cache_free(kcm_muxp, mux);
+}
+
+static void release_mux(struct kcm_mux *mux)
+{
+	struct kcm_net *knet = mux->knet;
+	struct kcm_psock *psock, *tmp_psock;
+
+	/* Release psocks */
+	list_for_each_entry_safe(psock, tmp_psock,
+				 &mux->psocks, psock_list) {
+		if (!WARN_ON(psock->unattaching))
+			kcm_unattach(psock);
+	}
+
+	if (WARN_ON(mux->psocks_cnt))
+		return;
+
+	__skb_queue_purge(&mux->rx_hold_queue);
+
+	mutex_lock(&knet->mutex);
+	list_del_rcu(&mux->kcm_mux_list);
+	knet->count--;
+	mutex_unlock(&knet->mutex);
+
+	call_rcu(&mux->rcu, free_mux);
+}
+
+static void kcm_done(struct kcm_sock *kcm)
+{
+	struct kcm_mux *mux = kcm->mux;
+	struct sock *sk = &kcm->sk;
+	int socks_cnt;
+
+	spin_lock_bh(&mux->rx_lock);
+	if (kcm->rx_psock) {
+		/* Cleanup in unreserve_rx_kcm */
+		WARN_ON(kcm->done);
+		kcm->rx_disabled = 1;
+		kcm->done = 1;
+		spin_unlock_bh(&mux->rx_lock);
+		return;
+	}
+
+	if (kcm->rx_wait) {
+		list_del(&kcm->wait_rx_list);
+		kcm->rx_wait = false;
+	}
+	/* Move any pending receive messages to other kcm sockets */
+	requeue_rx_msgs(mux, &sk->sk_receive_queue);
+
+	spin_unlock_bh(&mux->rx_lock);
+
+	if (WARN_ON(sk_rmem_alloc_get(sk)))
+		return;
+
+	/* Detach from MUX */
+	spin_lock_bh(&mux->lock);
+
+	list_del(&kcm->kcm_sock_list);
+	mux->kcm_socks_cnt--;
+	socks_cnt = mux->kcm_socks_cnt;
+
+	spin_unlock_bh(&mux->lock);
+
+	if (!socks_cnt) {
+		/* We are done with the mux now. */
+		release_mux(mux);
+	}
+
+	WARN_ON(kcm->rx_wait);
+
+	sock_put(&kcm->sk);
+}
+
+/* Called by kcm_release to close a KCM socket.
+ * If this is the last KCM socket on the MUX, destroy the MUX.
+ */
+static int kcm_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	struct kcm_sock *kcm;
+	struct kcm_mux *mux;
+	struct kcm_psock *psock;
+
+	if (!sk)
+		return 0;
+
+	kcm = kcm_sk(sk);
+	mux = kcm->mux;
+
+	sock_orphan(sk);
+	kfree_skb(kcm->seq_skb);
+
+	lock_sock(sk);
+	/* Purge queue under lock to avoid race condition with tx_work trying
+	 * to act when queue is nonempty. If tx_work runs after this point
+	 * it will just return.
+	 */
+	__skb_queue_purge(&sk->sk_write_queue);
+	release_sock(sk);
+
+	spin_lock_bh(&mux->lock);
+	if (kcm->tx_wait) {
+		/* Take of tx_wait list, after this point there should be no way
+		 * that a psock will be assigned to this kcm.
+		 */
+		list_del(&kcm->wait_psock_list);
+		kcm->tx_wait = false;
+	}
+	spin_unlock_bh(&mux->lock);
+
+	/* Cancel work. After this point there should be no outside references
+	 * to the kcm socket.
+	 */
+	cancel_work_sync(&kcm->tx_work);
+
+	lock_sock(sk);
+	psock = kcm->tx_psock;
+	if (psock) {
+		/* A psock was reserved, so we need to kill it since it
+		 * may already have some bytes queued from a message. We
+		 * need to do this after removing kcm from tx_wait list.
+		 */
+		kcm_abort_tx_psock(psock, EPIPE, false);
+		unreserve_psock(kcm);
+	}
+	release_sock(sk);
+
+	WARN_ON(kcm->tx_wait);
+	WARN_ON(kcm->tx_psock);
+
+	sock->sk = NULL;
+
+	kcm_done(kcm);
+
+	return 0;
+}
+
+static const struct proto_ops kcm_ops = {
+	.family =	PF_KCM,
+	.owner =	THIS_MODULE,
+	.release =	kcm_release,
+	.bind =		sock_no_bind,
+	.connect =	sock_no_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	sock_no_getname,
+	.poll =		datagram_poll,
+	.ioctl =	kcm_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	sock_no_shutdown,
+	.setsockopt =	kcm_setsockopt,
+	.getsockopt =	kcm_getsockopt,
+	.sendmsg =	kcm_sendmsg,
+	.recvmsg =	kcm_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	sock_no_sendpage,
+};
+
+/* Create proto operation for kcm sockets */
+static int kcm_create(struct net *net, struct socket *sock,
+		      int protocol, int kern)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+	struct sock *sk;
+	struct kcm_mux *mux;
+
+	switch (sock->type) {
+	case SOCK_DGRAM:
+	case SOCK_SEQPACKET:
+		sock->ops = &kcm_ops;
+		break;
+	default:
+		return -ESOCKTNOSUPPORT;
+	}
+
+	if (protocol != KCMPROTO_CONNECTED)
+		return -EPROTONOSUPPORT;
+
+	sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
+	if (!sk)
+		return -ENOMEM;
+
+	/* Allocate a kcm mux, shared between KCM sockets */
+	mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
+	if (!mux) {
+		sk_free(sk);
+		return -ENOMEM;
+	}
+
+	spin_lock_init(&mux->lock);
+	spin_lock_init(&mux->rx_lock);
+	INIT_LIST_HEAD(&mux->kcm_socks);
+	INIT_LIST_HEAD(&mux->kcm_rx_waiters);
+	INIT_LIST_HEAD(&mux->kcm_tx_waiters);
+
+	INIT_LIST_HEAD(&mux->psocks);
+	INIT_LIST_HEAD(&mux->psocks_ready);
+	INIT_LIST_HEAD(&mux->psocks_avail);
+
+	mux->knet = knet;
+
+	/* Add new MUX to list */
+	mutex_lock(&knet->mutex);
+	list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
+	knet->count++;
+	mutex_unlock(&knet->mutex);
+
+	skb_queue_head_init(&mux->rx_hold_queue);
+
+	/* Init KCM socket */
+	sock_init_data(sock, sk);
+	init_kcm_sock(kcm_sk(sk), mux);
+
+	return 0;
+}
+
+static struct net_proto_family kcm_family_ops = {
+	.family = PF_KCM,
+	.create = kcm_create,
+	.owner  = THIS_MODULE,
+};
+
+static __net_init int kcm_init_net(struct net *net)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	INIT_LIST_HEAD_RCU(&knet->mux_list);
+	mutex_init(&knet->mutex);
+
+	return 0;
+}
+
+static __net_exit void kcm_exit_net(struct net *net)
+{
+	struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+	/* All KCM sockets should be closed at this point, which should mean
+	 * that all multiplexors and psocks have been destroyed.
+	 */
+	WARN_ON(!list_empty(&knet->mux_list));
+}
+
+static struct pernet_operations kcm_net_ops = {
+	.init = kcm_init_net,
+	.exit = kcm_exit_net,
+	.id   = &kcm_net_id,
+	.size = sizeof(struct kcm_net),
+};
+
+static int __init kcm_init(void)
+{
+	int err = -ENOMEM;
+
+	kcm_muxp = kmem_cache_create("kcm_mux_cache",
+				     sizeof(struct kcm_mux), 0,
+				     SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	if (!kcm_muxp)
+		goto fail;
+
+	kcm_psockp = kmem_cache_create("kcm_psock_cache",
+				       sizeof(struct kcm_psock), 0,
+					SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+	if (!kcm_psockp)
+		goto fail;
+
+	kcm_wq = create_singlethread_workqueue("kkcmd");
+	if (!kcm_wq)
+		goto fail;
+
+	err = proto_register(&kcm_proto, 1);
+	if (err)
+		goto fail;
+
+	err = sock_register(&kcm_family_ops);
+	if (err)
+		goto sock_register_fail;
+
+	err = register_pernet_device(&kcm_net_ops);
+	if (err)
+		goto net_ops_fail;
+
+	return 0;
+
+net_ops_fail:
+	sock_unregister(PF_KCM);
+
+sock_register_fail:
+	proto_unregister(&kcm_proto);
+
+fail:
+	kmem_cache_destroy(kcm_muxp);
+	kmem_cache_destroy(kcm_psockp);
+
+	if (kcm_wq)
+		destroy_workqueue(kcm_wq);
+
+	return err;
+}
+
+static void __exit kcm_exit(void)
+{
+	unregister_pernet_device(&kcm_net_ops);
+	sock_unregister(PF_KCM);
+	proto_unregister(&kcm_proto);
+	destroy_workqueue(kcm_wq);
+
+	kmem_cache_destroy(kcm_muxp);
+	kmem_cache_destroy(kcm_psockp);
+}
+
+module_init(kcm_init);
+module_exit(kcm_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_KCM);
+
-- 
cgit v1.2.3


From fae98164463948eb31d638fe207c25a3d2b9312f Mon Sep 17 00:00:00 2001
From: Linus Walleij <linus.walleij@linaro.org>
Date: Thu, 10 Mar 2016 15:48:06 +0700
Subject: gpio: uapi: use 0xB4 as ioctl() major

The previous 'o' is in conflict and not very orderly assigned.
We want to select an ioctl() major that does not conflict with
the existining ones.

Add the new reserved major (0xB4) to Documentation/ioctl/ioctl-number.txt

Fixes: 3c702e9987e2 ("gpio: add a userspace chardev ABI for GPIOs")
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 Documentation/ioctl/ioctl-number.txt | 1 +
 include/uapi/linux/gpio.h            | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 91261a32a573..9369d3b0f09a 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -319,6 +319,7 @@ Code  Seq#(hex)	Include File		Comments
 					<mailto:vgo@ratio.de>
 0xB1	00-1F	PPPoX			<mailto:mostrows@styx.uwaterloo.ca>
 0xB3	00	linux/mmc/ioctl.h
+0xB4	00-0F	linux/gpio.h		<mailto:linux-gpio@vger.kernel.org>
 0xC0	00-0F	linux/usb/iowarrior.h
 0xCA	00-0F	uapi/misc/cxl.h
 0xCA	80-8F	uapi/scsi/cxlflash_ioctl.h
diff --git a/include/uapi/linux/gpio.h b/include/uapi/linux/gpio.h
index dfe8ade2742d..d0a3cac72250 100644
--- a/include/uapi/linux/gpio.h
+++ b/include/uapi/linux/gpio.h
@@ -52,7 +52,7 @@ struct gpioline_info {
 	char consumer[32];
 };
 
-#define GPIO_GET_CHIPINFO_IOCTL _IOR('o', 0x01, struct gpiochip_info)
-#define GPIO_GET_LINEINFO_IOCTL _IOWR('o', 0x02, struct gpioline_info)
+#define GPIO_GET_CHIPINFO_IOCTL _IOR(0xB4, 0x01, struct gpiochip_info)
+#define GPIO_GET_LINEINFO_IOCTL _IOWR(0xB4, 0x02, struct gpioline_info)
 
 #endif /* _UAPI_GPIO_H_ */
-- 
cgit v1.2.3


From 5b33f48842fa1e13e9c0ea8cc59c1d0df19042db Mon Sep 17 00:00:00 2001
From: Amir Vadai <amir@vadai.me>
Date: Tue, 8 Mar 2016 12:42:29 +0200
Subject: net/flower: Introduce hardware offload support

This patch is based on a patch made by John Fastabend.
It adds support for offloading cls_flower.
when NETIF_F_HW_TC is on:
  flags = 0       => Rule will be processed twice - by hardware, and if
                     still relevant, by software.
  flags = SKIP_HW => Rull will be processed by software only

If hardware fail/not capabale to apply the rule, operation will NOT
fail. Filter will be processed by SW only.

Acked-by: Jiri Pirko <jiri@mellanox.com>
Suggested-by: John Fastabend <john.r.fastabend@intel.com>
Signed-off-by: Amir Vadai <amir@vadai.me>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h    |  2 ++
 include/net/pkt_cls.h        | 14 ++++++++++
 include/uapi/linux/pkt_cls.h |  2 ++
 net/sched/cls_flower.c       | 64 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 81 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fd30cb545c45..41df0b450757 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -786,6 +786,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
 enum {
 	TC_SETUP_MQPRIO,
 	TC_SETUP_CLSU32,
+	TC_SETUP_CLSFLOWER,
 };
 
 struct tc_cls_u32_offload;
@@ -795,6 +796,7 @@ struct tc_to_netdev {
 	union {
 		u8 tc;
 		struct tc_cls_u32_offload *cls_u32;
+		struct tc_cls_flower_offload *cls_flower;
 	};
 };
 
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index bea14eee373e..5b4e8f08b8f0 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -409,4 +409,18 @@ static inline bool tc_should_offload(struct net_device *dev, u32 flags)
 	return true;
 }
 
+enum tc_fl_command {
+	TC_CLSFLOWER_REPLACE,
+	TC_CLSFLOWER_DESTROY,
+};
+
+struct tc_cls_flower_offload {
+	enum tc_fl_command command;
+	u64 cookie;
+	struct flow_dissector *dissector;
+	struct fl_flow_key *mask;
+	struct fl_flow_key *key;
+	struct tcf_exts *exts;
+};
+
 #endif
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 9874f5680926..c43c5f78b9c4 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -417,6 +417,8 @@ enum {
 	TCA_FLOWER_KEY_TCP_DST,		/* be16 */
 	TCA_FLOWER_KEY_UDP_SRC,		/* be16 */
 	TCA_FLOWER_KEY_UDP_DST,		/* be16 */
+
+	TCA_FLOWER_FLAGS,
 	__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 95b021243233..25d87666bf1e 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -165,6 +165,51 @@ static void fl_destroy_filter(struct rcu_head *head)
 	kfree(f);
 }
 
+static void fl_hw_destroy_filter(struct tcf_proto *tp, u64 cookie)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_flower_offload offload = {0};
+	struct tc_to_netdev tc;
+
+	if (!tc_should_offload(dev, 0))
+		return;
+
+	offload.command = TC_CLSFLOWER_DESTROY;
+	offload.cookie = cookie;
+
+	tc.type = TC_SETUP_CLSFLOWER;
+	tc.cls_flower = &offload;
+
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
+static void fl_hw_replace_filter(struct tcf_proto *tp,
+				 struct flow_dissector *dissector,
+				 struct fl_flow_key *mask,
+				 struct fl_flow_key *key,
+				 struct tcf_exts *actions,
+				 u64 cookie, u32 flags)
+{
+	struct net_device *dev = tp->q->dev_queue->dev;
+	struct tc_cls_flower_offload offload = {0};
+	struct tc_to_netdev tc;
+
+	if (!tc_should_offload(dev, flags))
+		return;
+
+	offload.command = TC_CLSFLOWER_REPLACE;
+	offload.cookie = cookie;
+	offload.dissector = dissector;
+	offload.mask = mask;
+	offload.key = key;
+	offload.exts = actions;
+
+	tc.type = TC_SETUP_CLSFLOWER;
+	tc.cls_flower = &offload;
+
+	dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
 static bool fl_destroy(struct tcf_proto *tp, bool force)
 {
 	struct cls_fl_head *head = rtnl_dereference(tp->root);
@@ -174,6 +219,7 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
 		return false;
 
 	list_for_each_entry_safe(f, next, &head->filters, list) {
+		fl_hw_destroy_filter(tp, (u64)f);
 		list_del_rcu(&f->list);
 		call_rcu(&f->rcu, fl_destroy_filter);
 	}
@@ -459,6 +505,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	struct cls_fl_filter *fnew;
 	struct nlattr *tb[TCA_FLOWER_MAX + 1];
 	struct fl_flow_mask mask = {};
+	u32 flags = 0;
 	int err;
 
 	if (!tca[TCA_OPTIONS])
@@ -486,6 +533,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 	}
 	fnew->handle = handle;
 
+	if (tb[TCA_FLOWER_FLAGS])
+		flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+
 	err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
 	if (err)
 		goto errout;
@@ -498,9 +548,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
 				     head->ht_params);
 	if (err)
 		goto errout;
-	if (fold)
+
+	fl_hw_replace_filter(tp,
+			     &head->dissector,
+			     &mask.key,
+			     &fnew->key,
+			     &fnew->exts,
+			     (u64)fnew,
+			     flags);
+
+	if (fold) {
 		rhashtable_remove_fast(&head->ht, &fold->ht_node,
 				       head->ht_params);
+		fl_hw_destroy_filter(tp, (u64)fold);
+	}
 
 	*arg = (unsigned long) fnew;
 
@@ -527,6 +588,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
 	rhashtable_remove_fast(&head->ht, &f->ht_node,
 			       head->ht_params);
 	list_del_rcu(&f->list);
+	fl_hw_destroy_filter(tp, (u64)f);
 	tcf_unbind_filter(tp, &f->res);
 	call_rcu(&f->rcu, fl_destroy_filter);
 	return 0;
-- 
cgit v1.2.3


From 2b6a321da9a2d8725a1d3dbb0b2e96a7618ebe56 Mon Sep 17 00:00:00 2001
From: Andrew Duggan <aduggan@synaptics.com>
Date: Thu, 10 Mar 2016 15:35:49 -0800
Subject: Input: synaptics-rmi4 - add support for Synaptics RMI4 devices

Synaptics uses the Register Mapped Interface (RMI) protocol as a
communications interface for their devices. This driver adds the core
functionality needed to interface with RMI4 devices.

RMI devices can be connected to the host via several transport protocols
and can supports a wide variety of functionality defined by RMI functions.
Support for transport protocols and RMI functions are implemented in
individual drivers. The RMI4 core driver uses a bus architecture to
facilitate the various combinations of transport and function drivers
needed by a particular device.

Signed-off-by: Andrew Duggan <aduggan@synaptics.com>
Signed-off-by: Christopher Heiny <cheiny@synaptics.com>
Tested-by: Benjamin Tissoires <benjamin.tissoires@redhat.com>
Tested-by: Linus Walleij <linus.walleij@linaro.org>
Tested-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
---
 drivers/input/Kconfig           |    2 +
 drivers/input/Makefile          |    2 +
 drivers/input/rmi4/Kconfig      |   10 +
 drivers/input/rmi4/Makefile     |    2 +
 drivers/input/rmi4/rmi_bus.c    |  375 ++++++++++++++
 drivers/input/rmi4/rmi_bus.h    |  186 +++++++
 drivers/input/rmi4/rmi_driver.c | 1027 +++++++++++++++++++++++++++++++++++++++
 drivers/input/rmi4/rmi_driver.h |  103 ++++
 drivers/input/rmi4/rmi_f01.c    |  574 ++++++++++++++++++++++
 include/linux/rmi.h             |  209 ++++++++
 include/uapi/linux/input.h      |    1 +
 11 files changed, 2491 insertions(+)
 create mode 100644 drivers/input/rmi4/Kconfig
 create mode 100644 drivers/input/rmi4/Makefile
 create mode 100644 drivers/input/rmi4/rmi_bus.c
 create mode 100644 drivers/input/rmi4/rmi_bus.h
 create mode 100644 drivers/input/rmi4/rmi_driver.c
 create mode 100644 drivers/input/rmi4/rmi_driver.h
 create mode 100644 drivers/input/rmi4/rmi_f01.c
 create mode 100644 include/linux/rmi.h

(limited to 'include/uapi/linux')

diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig
index a35532ec00e4..6261874c07c9 100644
--- a/drivers/input/Kconfig
+++ b/drivers/input/Kconfig
@@ -201,6 +201,8 @@ source "drivers/input/touchscreen/Kconfig"
 
 source "drivers/input/misc/Kconfig"
 
+source "drivers/input/rmi4/Kconfig"
+
 endif
 
 menu "Hardware I/O ports"
diff --git a/drivers/input/Makefile b/drivers/input/Makefile
index 0c9302ca9954..595820bbabe9 100644
--- a/drivers/input/Makefile
+++ b/drivers/input/Makefile
@@ -26,3 +26,5 @@ obj-$(CONFIG_INPUT_TOUCHSCREEN)	+= touchscreen/
 obj-$(CONFIG_INPUT_MISC)	+= misc/
 
 obj-$(CONFIG_INPUT_APMPOWER)	+= apm-power.o
+
+obj-$(CONFIG_RMI4_CORE)		+= rmi4/
diff --git a/drivers/input/rmi4/Kconfig b/drivers/input/rmi4/Kconfig
new file mode 100644
index 000000000000..5ea60e338093
--- /dev/null
+++ b/drivers/input/rmi4/Kconfig
@@ -0,0 +1,10 @@
+#
+# RMI4 configuration
+#
+config RMI4_CORE
+	tristate "Synaptics RMI4 bus support"
+	help
+	  Say Y here if you want to support the Synaptics RMI4 bus.  This is
+	  required for all RMI4 device support.
+
+	  If unsure, say Y.
diff --git a/drivers/input/rmi4/Makefile b/drivers/input/rmi4/Makefile
new file mode 100644
index 000000000000..12f219779b74
--- /dev/null
+++ b/drivers/input/rmi4/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_RMI4_CORE) += rmi_core.o
+rmi_core-y := rmi_bus.o rmi_driver.o rmi_f01.o
diff --git a/drivers/input/rmi4/rmi_bus.c b/drivers/input/rmi4/rmi_bus.c
new file mode 100644
index 000000000000..0a2bd5a0f2b7
--- /dev/null
+++ b/drivers/input/rmi4/rmi_bus.c
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2011-2016 Synaptics Incorporated
+ * Copyright (c) 2011 Unixphere
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/kconfig.h>
+#include <linux/list.h>
+#include <linux/pm.h>
+#include <linux/rmi.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/of.h>
+#include "rmi_bus.h"
+#include "rmi_driver.h"
+
+static int debug_flags;
+module_param(debug_flags, int, 0644);
+MODULE_PARM_DESC(debug_flags, "control debugging information");
+
+void rmi_dbg(int flags, struct device *dev, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	if (flags & debug_flags) {
+		va_start(args, fmt);
+
+		vaf.fmt = fmt;
+		vaf.va = &args;
+
+		dev_printk(KERN_DEBUG, dev, "%pV", &vaf);
+
+		va_end(args);
+	}
+}
+EXPORT_SYMBOL_GPL(rmi_dbg);
+
+/*
+ * RMI Physical devices
+ *
+ * Physical RMI device consists of several functions serving particular
+ * purpose. For example F11 is a 2D touch sensor while F01 is a generic
+ * function present in every RMI device.
+ */
+
+static void rmi_release_device(struct device *dev)
+{
+	struct rmi_device *rmi_dev = to_rmi_device(dev);
+
+	kfree(rmi_dev);
+}
+
+static struct device_type rmi_device_type = {
+	.name		= "rmi4_sensor",
+	.release	= rmi_release_device,
+};
+
+bool rmi_is_physical_device(struct device *dev)
+{
+	return dev->type == &rmi_device_type;
+}
+
+/**
+ * rmi_register_transport_device - register a transport device connection
+ * on the RMI bus.  Transport drivers provide communication from the devices
+ * on a bus (such as SPI, I2C, and so on) to the RMI4 sensor.
+ *
+ * @xport: the transport device to register
+ */
+int rmi_register_transport_device(struct rmi_transport_dev *xport)
+{
+	static atomic_t transport_device_count = ATOMIC_INIT(0);
+	struct rmi_device *rmi_dev;
+	int error;
+
+	rmi_dev = kzalloc(sizeof(struct rmi_device), GFP_KERNEL);
+	if (!rmi_dev)
+		return -ENOMEM;
+
+	device_initialize(&rmi_dev->dev);
+
+	rmi_dev->xport = xport;
+	rmi_dev->number = atomic_inc_return(&transport_device_count) - 1;
+
+	dev_set_name(&rmi_dev->dev, "rmi4-%02d", rmi_dev->number);
+
+	rmi_dev->dev.bus = &rmi_bus_type;
+	rmi_dev->dev.type = &rmi_device_type;
+
+	xport->rmi_dev = rmi_dev;
+
+	error = device_add(&rmi_dev->dev);
+	if (error)
+		goto err_put_device;
+
+	rmi_dbg(RMI_DEBUG_CORE, xport->dev,
+		"%s: Registered %s as %s.\n", __func__,
+		dev_name(rmi_dev->xport->dev), dev_name(&rmi_dev->dev));
+
+	return 0;
+
+err_put_device:
+	put_device(&rmi_dev->dev);
+	return error;
+}
+EXPORT_SYMBOL_GPL(rmi_register_transport_device);
+
+/**
+ * rmi_unregister_transport_device - unregister a transport device connection
+ * @xport: the transport driver to unregister
+ *
+ */
+void rmi_unregister_transport_device(struct rmi_transport_dev *xport)
+{
+	struct rmi_device *rmi_dev = xport->rmi_dev;
+
+	device_del(&rmi_dev->dev);
+	put_device(&rmi_dev->dev);
+}
+EXPORT_SYMBOL(rmi_unregister_transport_device);
+
+
+/* Function specific stuff */
+
+static void rmi_release_function(struct device *dev)
+{
+	struct rmi_function *fn = to_rmi_function(dev);
+
+	kfree(fn);
+}
+
+static struct device_type rmi_function_type = {
+	.name		= "rmi4_function",
+	.release	= rmi_release_function,
+};
+
+bool rmi_is_function_device(struct device *dev)
+{
+	return dev->type == &rmi_function_type;
+}
+
+static int rmi_function_match(struct device *dev, struct device_driver *drv)
+{
+	struct rmi_function_handler *handler = to_rmi_function_handler(drv);
+	struct rmi_function *fn = to_rmi_function(dev);
+
+	return fn->fd.function_number == handler->func;
+}
+
+static int rmi_function_probe(struct device *dev)
+{
+	struct rmi_function *fn = to_rmi_function(dev);
+	struct rmi_function_handler *handler =
+					to_rmi_function_handler(dev->driver);
+	int error;
+
+	if (handler->probe) {
+		error = handler->probe(fn);
+		return error;
+	}
+
+	return 0;
+}
+
+static int rmi_function_remove(struct device *dev)
+{
+	struct rmi_function *fn = to_rmi_function(dev);
+	struct rmi_function_handler *handler =
+					to_rmi_function_handler(dev->driver);
+
+	if (handler->remove)
+		handler->remove(fn);
+
+	return 0;
+}
+
+int rmi_register_function(struct rmi_function *fn)
+{
+	struct rmi_device *rmi_dev = fn->rmi_dev;
+	int error;
+
+	device_initialize(&fn->dev);
+
+	dev_set_name(&fn->dev, "%s.fn%02x",
+		     dev_name(&rmi_dev->dev), fn->fd.function_number);
+
+	fn->dev.parent = &rmi_dev->dev;
+	fn->dev.type = &rmi_function_type;
+	fn->dev.bus = &rmi_bus_type;
+
+	error = device_add(&fn->dev);
+	if (error) {
+		dev_err(&rmi_dev->dev,
+			"Failed device_register function device %s\n",
+			dev_name(&fn->dev));
+		goto err_put_device;
+	}
+
+	rmi_dbg(RMI_DEBUG_CORE, &rmi_dev->dev, "Registered F%02X.\n",
+			fn->fd.function_number);
+
+	return 0;
+
+err_put_device:
+	put_device(&fn->dev);
+	return error;
+}
+
+void rmi_unregister_function(struct rmi_function *fn)
+{
+	device_del(&fn->dev);
+
+	if (fn->dev.of_node)
+		of_node_put(fn->dev.of_node);
+
+	put_device(&fn->dev);
+}
+
+/**
+ * rmi_register_function_handler - register a handler for an RMI function
+ * @handler: RMI handler that should be registered.
+ * @module: pointer to module that implements the handler
+ * @mod_name: name of the module implementing the handler
+ *
+ * This function performs additional setup of RMI function handler and
+ * registers it with the RMI core so that it can be bound to
+ * RMI function devices.
+ */
+int __rmi_register_function_handler(struct rmi_function_handler *handler,
+				     struct module *owner,
+				     const char *mod_name)
+{
+	struct device_driver *driver = &handler->driver;
+	int error;
+
+	driver->bus = &rmi_bus_type;
+	driver->owner = owner;
+	driver->mod_name = mod_name;
+	driver->probe = rmi_function_probe;
+	driver->remove = rmi_function_remove;
+
+	error = driver_register(&handler->driver);
+	if (error) {
+		pr_err("driver_register() failed for %s, error: %d\n",
+			handler->driver.name, error);
+		return error;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__rmi_register_function_handler);
+
+/**
+ * rmi_unregister_function_handler - unregister given RMI function handler
+ * @handler: RMI handler that should be unregistered.
+ *
+ * This function unregisters given function handler from RMI core which
+ * causes it to be unbound from the function devices.
+ */
+void rmi_unregister_function_handler(struct rmi_function_handler *handler)
+{
+	driver_unregister(&handler->driver);
+}
+EXPORT_SYMBOL_GPL(rmi_unregister_function_handler);
+
+/* Bus specific stuff */
+
+static int rmi_bus_match(struct device *dev, struct device_driver *drv)
+{
+	bool physical = rmi_is_physical_device(dev);
+
+	/* First see if types are not compatible */
+	if (physical != rmi_is_physical_driver(drv))
+		return 0;
+
+	return physical || rmi_function_match(dev, drv);
+}
+
+struct bus_type rmi_bus_type = {
+	.match		= rmi_bus_match,
+	.name		= "rmi4",
+};
+
+static struct rmi_function_handler *fn_handlers[] = {
+	&rmi_f01_handler,
+};
+
+static void __rmi_unregister_function_handlers(int start_idx)
+{
+	int i;
+
+	for (i = start_idx; i >= 0; i--)
+		rmi_unregister_function_handler(fn_handlers[i]);
+}
+
+static void rmi_unregister_function_handlers(void)
+{
+	__rmi_unregister_function_handlers(ARRAY_SIZE(fn_handlers) - 1);
+}
+
+static int rmi_register_function_handlers(void)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(fn_handlers); i++)	{
+		ret = rmi_register_function_handler(fn_handlers[i]);
+		if (ret) {
+			pr_err("%s: error registering the RMI F%02x handler: %d\n",
+				__func__, fn_handlers[i]->func, ret);
+			goto err_unregister_function_handlers;
+		}
+	}
+
+	return 0;
+
+err_unregister_function_handlers:
+	__rmi_unregister_function_handlers(i - 1);
+	return ret;
+}
+
+static int __init rmi_bus_init(void)
+{
+	int error;
+
+	error = bus_register(&rmi_bus_type);
+	if (error) {
+		pr_err("%s: error registering the RMI bus: %d\n",
+			__func__, error);
+		return error;
+	}
+
+	error = rmi_register_function_handlers();
+	if (error)
+		goto err_unregister_bus;
+
+	error = rmi_register_physical_driver();
+	if (error) {
+		pr_err("%s: error registering the RMI physical driver: %d\n",
+			__func__, error);
+		goto err_unregister_bus;
+	}
+
+	return 0;
+
+err_unregister_bus:
+	bus_unregister(&rmi_bus_type);
+	return error;
+}
+module_init(rmi_bus_init);
+
+static void __exit rmi_bus_exit(void)
+{
+	/*
+	 * We should only ever get here if all drivers are unloaded, so
+	 * all we have to do at this point is unregister ourselves.
+	 */
+
+	rmi_unregister_physical_driver();
+	rmi_unregister_function_handlers();
+	bus_unregister(&rmi_bus_type);
+}
+module_exit(rmi_bus_exit);
+
+MODULE_AUTHOR("Christopher Heiny <cheiny@synaptics.com");
+MODULE_AUTHOR("Andrew Duggan <aduggan@synaptics.com");
+MODULE_DESCRIPTION("RMI bus");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(RMI_DRIVER_VERSION);
diff --git a/drivers/input/rmi4/rmi_bus.h b/drivers/input/rmi4/rmi_bus.h
new file mode 100644
index 000000000000..13b148d44b37
--- /dev/null
+++ b/drivers/input/rmi4/rmi_bus.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2011-2016 Synaptics Incorporated
+ * Copyright (c) 2011 Unixphere
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef _RMI_BUS_H
+#define _RMI_BUS_H
+
+#include <linux/rmi.h>
+
+struct rmi_device;
+
+/**
+ * struct rmi_function - represents the implementation of an RMI4
+ * function for a particular device (basically, a driver for that RMI4 function)
+ *
+ * @fd: The function descriptor of the RMI function
+ * @rmi_dev: Pointer to the RMI device associated with this function container
+ * @dev: The device associated with this particular function.
+ *
+ * @num_of_irqs: The number of irqs needed by this function
+ * @irq_pos: The position in the irq bitfield this function holds
+ * @irq_mask: For convenience, can be used to mask IRQ bits off during ATTN
+ * interrupt handling.
+ *
+ * @node: entry in device's list of functions
+ */
+struct rmi_function {
+	struct rmi_function_descriptor fd;
+	struct rmi_device *rmi_dev;
+	struct device dev;
+	struct list_head node;
+
+	unsigned int num_of_irqs;
+	unsigned int irq_pos;
+	unsigned long irq_mask[];
+};
+
+#define to_rmi_function(d)	container_of(d, struct rmi_function, dev)
+
+bool rmi_is_function_device(struct device *dev);
+
+int __must_check rmi_register_function(struct rmi_function *);
+void rmi_unregister_function(struct rmi_function *);
+
+/**
+ * struct rmi_function_handler - driver routines for a particular RMI function.
+ *
+ * @func: The RMI function number
+ * @reset: Called when a reset of the touch sensor is detected.  The routine
+ * should perform any out-of-the-ordinary reset handling that might be
+ * necessary.  Restoring of touch sensor configuration registers should be
+ * handled in the config() callback, below.
+ * @config: Called when the function container is first initialized, and
+ * after a reset is detected.  This routine should write any necessary
+ * configuration settings to the device.
+ * @attention: Called when the IRQ(s) for the function are set by the touch
+ * sensor.
+ * @suspend: Should perform any required operations to suspend the particular
+ * function.
+ * @resume: Should perform any required operations to resume the particular
+ * function.
+ *
+ * All callbacks are expected to return 0 on success, error code on failure.
+ */
+struct rmi_function_handler {
+	struct device_driver driver;
+
+	u8 func;
+
+	int (*probe)(struct rmi_function *fn);
+	void (*remove)(struct rmi_function *fn);
+	int (*config)(struct rmi_function *fn);
+	int (*reset)(struct rmi_function *fn);
+	int (*attention)(struct rmi_function *fn, unsigned long *irq_bits);
+	int (*suspend)(struct rmi_function *fn);
+	int (*resume)(struct rmi_function *fn);
+};
+
+#define to_rmi_function_handler(d) \
+		container_of(d, struct rmi_function_handler, driver)
+
+int __must_check __rmi_register_function_handler(struct rmi_function_handler *,
+						 struct module *, const char *);
+#define rmi_register_function_handler(handler) \
+	__rmi_register_function_handler(handler, THIS_MODULE, KBUILD_MODNAME)
+
+void rmi_unregister_function_handler(struct rmi_function_handler *);
+
+#define to_rmi_driver(d) \
+	container_of(d, struct rmi_driver, driver)
+
+#define to_rmi_device(d) container_of(d, struct rmi_device, dev)
+
+static inline struct rmi_device_platform_data *
+rmi_get_platform_data(struct rmi_device *d)
+{
+	return &d->xport->pdata;
+}
+
+bool rmi_is_physical_device(struct device *dev);
+
+/**
+ * rmi_read - read a single byte
+ * @d: Pointer to an RMI device
+ * @addr: The address to read from
+ * @buf: The read buffer
+ *
+ * Reads a single byte of data using the underlying transport protocol
+ * into memory pointed by @buf. It returns 0 on success or a negative
+ * error code.
+ */
+static inline int rmi_read(struct rmi_device *d, u16 addr, u8 *buf)
+{
+	return d->xport->ops->read_block(d->xport, addr, buf, 1);
+}
+
+/**
+ * rmi_read_block - read a block of bytes
+ * @d: Pointer to an RMI device
+ * @addr: The start address to read from
+ * @buf: The read buffer
+ * @len: Length of the read buffer
+ *
+ * Reads a block of byte data using the underlying transport protocol
+ * into memory pointed by @buf. It returns 0 on success or a negative
+ * error code.
+ */
+static inline int rmi_read_block(struct rmi_device *d, u16 addr,
+				 void *buf, size_t len)
+{
+	return d->xport->ops->read_block(d->xport, addr, buf, len);
+}
+
+/**
+ * rmi_write - write a single byte
+ * @d: Pointer to an RMI device
+ * @addr: The address to write to
+ * @data: The data to write
+ *
+ * Writes a single byte using the underlying transport protocol. It
+ * returns zero on success or a negative error code.
+ */
+static inline int rmi_write(struct rmi_device *d, u16 addr, u8 data)
+{
+	return d->xport->ops->write_block(d->xport, addr, &data, 1);
+}
+
+/**
+ * rmi_write_block - write a block of bytes
+ * @d: Pointer to an RMI device
+ * @addr: The start address to write to
+ * @buf: The write buffer
+ * @len: Length of the write buffer
+ *
+ * Writes a block of byte data from buf using the underlaying transport
+ * protocol.  It returns the amount of bytes written or a negative error code.
+ */
+static inline int rmi_write_block(struct rmi_device *d, u16 addr,
+				  const void *buf, size_t len)
+{
+	return d->xport->ops->write_block(d->xport, addr, buf, len);
+}
+
+int rmi_for_each_dev(void *data, int (*func)(struct device *dev, void *data));
+
+extern struct bus_type rmi_bus_type;
+
+int rmi_of_property_read_u32(struct device *dev, u32 *result,
+				const char *prop, bool optional);
+int rmi_of_property_read_u16(struct device *dev, u16 *result,
+				const char *prop, bool optional);
+int rmi_of_property_read_u8(struct device *dev, u8 *result,
+				const char *prop, bool optional);
+
+#define RMI_DEBUG_CORE			BIT(0)
+#define RMI_DEBUG_XPORT			BIT(1)
+#define RMI_DEBUG_FN			BIT(2)
+#define RMI_DEBUG_2D_SENSOR		BIT(3)
+
+void rmi_dbg(int flags, struct device *dev, const char *fmt, ...);
+#endif
diff --git a/drivers/input/rmi4/rmi_driver.c b/drivers/input/rmi4/rmi_driver.c
new file mode 100644
index 000000000000..b0f34b57a126
--- /dev/null
+++ b/drivers/input/rmi4/rmi_driver.c
@@ -0,0 +1,1027 @@
+/*
+ * Copyright (c) 2011-2016 Synaptics Incorporated
+ * Copyright (c) 2011 Unixphere
+ *
+ * This driver provides the core support for a single RMI4-based device.
+ *
+ * The RMI4 specification can be found here (URL split for line length):
+ *
+ * http://www.synaptics.com/sites/default/files/
+ *      511-000136-01-Rev-E-RMI4-Interfacing-Guide.pdf
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/bitmap.h>
+#include <linux/delay.h>
+#include <linux/fs.h>
+#include <linux/kconfig.h>
+#include <linux/pm.h>
+#include <linux/slab.h>
+#include <uapi/linux/input.h>
+#include <linux/rmi.h>
+#include "rmi_bus.h"
+#include "rmi_driver.h"
+
+#define HAS_NONSTANDARD_PDT_MASK 0x40
+#define RMI4_MAX_PAGE 0xff
+#define RMI4_PAGE_SIZE 0x100
+#define RMI4_PAGE_MASK 0xFF00
+
+#define RMI_DEVICE_RESET_CMD	0x01
+#define DEFAULT_RESET_DELAY_MS	100
+
+static void rmi_free_function_list(struct rmi_device *rmi_dev)
+{
+	struct rmi_function *fn, *tmp;
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+
+	data->f01_container = NULL;
+
+	/* Doing it in the reverse order so F01 will be removed last */
+	list_for_each_entry_safe_reverse(fn, tmp,
+					 &data->function_list, node) {
+		list_del(&fn->node);
+		rmi_unregister_function(fn);
+	}
+}
+
+static int reset_one_function(struct rmi_function *fn)
+{
+	struct rmi_function_handler *fh;
+	int retval = 0;
+
+	if (!fn || !fn->dev.driver)
+		return 0;
+
+	fh = to_rmi_function_handler(fn->dev.driver);
+	if (fh->reset) {
+		retval = fh->reset(fn);
+		if (retval < 0)
+			dev_err(&fn->dev, "Reset failed with code %d.\n",
+				retval);
+	}
+
+	return retval;
+}
+
+static int configure_one_function(struct rmi_function *fn)
+{
+	struct rmi_function_handler *fh;
+	int retval = 0;
+
+	if (!fn || !fn->dev.driver)
+		return 0;
+
+	fh = to_rmi_function_handler(fn->dev.driver);
+	if (fh->config) {
+		retval = fh->config(fn);
+		if (retval < 0)
+			dev_err(&fn->dev, "Config failed with code %d.\n",
+				retval);
+	}
+
+	return retval;
+}
+
+static int rmi_driver_process_reset_requests(struct rmi_device *rmi_dev)
+{
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	struct rmi_function *entry;
+	int retval;
+
+	list_for_each_entry(entry, &data->function_list, node) {
+		retval = reset_one_function(entry);
+		if (retval < 0)
+			return retval;
+	}
+
+	return 0;
+}
+
+static int rmi_driver_process_config_requests(struct rmi_device *rmi_dev)
+{
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	struct rmi_function *entry;
+	int retval;
+
+	list_for_each_entry(entry, &data->function_list, node) {
+		retval = configure_one_function(entry);
+		if (retval < 0)
+			return retval;
+	}
+
+	return 0;
+}
+
+static void process_one_interrupt(struct rmi_driver_data *data,
+				  struct rmi_function *fn)
+{
+	struct rmi_function_handler *fh;
+
+	if (!fn || !fn->dev.driver)
+		return;
+
+	fh = to_rmi_function_handler(fn->dev.driver);
+	if (fn->irq_mask && fh->attention) {
+		bitmap_and(data->fn_irq_bits, data->irq_status, fn->irq_mask,
+				data->irq_count);
+		if (!bitmap_empty(data->fn_irq_bits, data->irq_count))
+			fh->attention(fn, data->fn_irq_bits);
+	}
+}
+
+int rmi_process_interrupt_requests(struct rmi_device *rmi_dev)
+{
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	struct device *dev = &rmi_dev->dev;
+	struct rmi_function *entry;
+	int error;
+
+	if (!data)
+		return 0;
+
+	if (!rmi_dev->xport->attn_data) {
+		error = rmi_read_block(rmi_dev,
+				data->f01_container->fd.data_base_addr + 1,
+				data->irq_status, data->num_of_irq_regs);
+		if (error < 0) {
+			dev_err(dev, "Failed to read irqs, code=%d\n", error);
+			return error;
+		}
+	}
+
+	mutex_lock(&data->irq_mutex);
+	bitmap_and(data->irq_status, data->irq_status, data->current_irq_mask,
+	       data->irq_count);
+	/*
+	 * At this point, irq_status has all bits that are set in the
+	 * interrupt status register and are enabled.
+	 */
+	mutex_unlock(&data->irq_mutex);
+
+	/*
+	 * It would be nice to be able to use irq_chip to handle these
+	 * nested IRQs.  Unfortunately, most of the current customers for
+	 * this driver are using older kernels (3.0.x) that don't support
+	 * the features required for that.  Once they've shifted to more
+	 * recent kernels (say, 3.3 and higher), this should be switched to
+	 * use irq_chip.
+	 */
+	list_for_each_entry(entry, &data->function_list, node)
+		if (entry->irq_mask)
+			process_one_interrupt(data, entry);
+
+	if (data->input)
+		input_sync(data->input);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rmi_process_interrupt_requests);
+
+static int suspend_one_function(struct rmi_function *fn)
+{
+	struct rmi_function_handler *fh;
+	int retval = 0;
+
+	if (!fn || !fn->dev.driver)
+		return 0;
+
+	fh = to_rmi_function_handler(fn->dev.driver);
+	if (fh->suspend) {
+		retval = fh->suspend(fn);
+		if (retval < 0)
+			dev_err(&fn->dev, "Suspend failed with code %d.\n",
+				retval);
+	}
+
+	return retval;
+}
+
+static int rmi_suspend_functions(struct rmi_device *rmi_dev)
+{
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	struct rmi_function *entry;
+	int retval;
+
+	list_for_each_entry(entry, &data->function_list, node) {
+		retval = suspend_one_function(entry);
+		if (retval < 0)
+			return retval;
+	}
+
+	return 0;
+}
+
+static int resume_one_function(struct rmi_function *fn)
+{
+	struct rmi_function_handler *fh;
+	int retval = 0;
+
+	if (!fn || !fn->dev.driver)
+		return 0;
+
+	fh = to_rmi_function_handler(fn->dev.driver);
+	if (fh->resume) {
+		retval = fh->resume(fn);
+		if (retval < 0)
+			dev_err(&fn->dev, "Resume failed with code %d.\n",
+				retval);
+	}
+
+	return retval;
+}
+
+static int rmi_resume_functions(struct rmi_device *rmi_dev)
+{
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	struct rmi_function *entry;
+	int retval;
+
+	list_for_each_entry(entry, &data->function_list, node) {
+		retval = resume_one_function(entry);
+		if (retval < 0)
+			return retval;
+	}
+
+	return 0;
+}
+
+static int enable_sensor(struct rmi_device *rmi_dev)
+{
+	int retval = 0;
+
+	retval = rmi_driver_process_config_requests(rmi_dev);
+	if (retval < 0)
+		return retval;
+
+	return rmi_process_interrupt_requests(rmi_dev);
+}
+
+/**
+ * rmi_driver_set_input_params - set input device id and other data.
+ *
+ * @rmi_dev: Pointer to an RMI device
+ * @input: Pointer to input device
+ *
+ */
+static int rmi_driver_set_input_params(struct rmi_device *rmi_dev,
+				struct input_dev *input)
+{
+	input->name = SYNAPTICS_INPUT_DEVICE_NAME;
+	input->id.vendor  = SYNAPTICS_VENDOR_ID;
+	input->id.bustype = BUS_RMI;
+	return 0;
+}
+
+static void rmi_driver_set_input_name(struct rmi_device *rmi_dev,
+				struct input_dev *input)
+{
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	char *device_name = rmi_f01_get_product_ID(data->f01_container);
+	char *name;
+
+	name = devm_kasprintf(&rmi_dev->dev, GFP_KERNEL,
+			      "Synaptics %s", device_name);
+	if (!name)
+		return;
+
+	input->name = name;
+}
+
+static int rmi_driver_set_irq_bits(struct rmi_device *rmi_dev,
+				   unsigned long *mask)
+{
+	int error = 0;
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	struct device *dev = &rmi_dev->dev;
+
+	mutex_lock(&data->irq_mutex);
+	bitmap_or(data->new_irq_mask,
+		  data->current_irq_mask, mask, data->irq_count);
+
+	error = rmi_write_block(rmi_dev,
+			data->f01_container->fd.control_base_addr + 1,
+			data->new_irq_mask, data->num_of_irq_regs);
+	if (error < 0) {
+		dev_err(dev, "%s: Failed to change enabled interrupts!",
+							__func__);
+		goto error_unlock;
+	}
+	bitmap_copy(data->current_irq_mask, data->new_irq_mask,
+		    data->num_of_irq_regs);
+
+error_unlock:
+	mutex_unlock(&data->irq_mutex);
+	return error;
+}
+
+static int rmi_driver_clear_irq_bits(struct rmi_device *rmi_dev,
+				     unsigned long *mask)
+{
+	int error = 0;
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	struct device *dev = &rmi_dev->dev;
+
+	mutex_lock(&data->irq_mutex);
+	bitmap_andnot(data->new_irq_mask,
+		  data->current_irq_mask, mask, data->irq_count);
+
+	error = rmi_write_block(rmi_dev,
+			data->f01_container->fd.control_base_addr + 1,
+			data->new_irq_mask, data->num_of_irq_regs);
+	if (error < 0) {
+		dev_err(dev, "%s: Failed to change enabled interrupts!",
+							__func__);
+		goto error_unlock;
+	}
+	bitmap_copy(data->current_irq_mask, data->new_irq_mask,
+		    data->num_of_irq_regs);
+
+error_unlock:
+	mutex_unlock(&data->irq_mutex);
+	return error;
+}
+
+static int rmi_driver_reset_handler(struct rmi_device *rmi_dev)
+{
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	int error;
+
+	/*
+	 * Can get called before the driver is fully ready to deal with
+	 * this situation.
+	 */
+	if (!data || !data->f01_container) {
+		dev_warn(&rmi_dev->dev,
+			 "Not ready to handle reset yet!\n");
+		return 0;
+	}
+
+	error = rmi_read_block(rmi_dev,
+			       data->f01_container->fd.control_base_addr + 1,
+			       data->current_irq_mask, data->num_of_irq_regs);
+	if (error < 0) {
+		dev_err(&rmi_dev->dev, "%s: Failed to read current IRQ mask.\n",
+			__func__);
+		return error;
+	}
+
+	error = rmi_driver_process_reset_requests(rmi_dev);
+	if (error < 0)
+		return error;
+
+	error = rmi_driver_process_config_requests(rmi_dev);
+	if (error < 0)
+		return error;
+
+	return 0;
+}
+
+int rmi_read_pdt_entry(struct rmi_device *rmi_dev, struct pdt_entry *entry,
+			u16 pdt_address)
+{
+	u8 buf[RMI_PDT_ENTRY_SIZE];
+	int error;
+
+	error = rmi_read_block(rmi_dev, pdt_address, buf, RMI_PDT_ENTRY_SIZE);
+	if (error) {
+		dev_err(&rmi_dev->dev, "Read PDT entry at %#06x failed, code: %d.\n",
+				pdt_address, error);
+		return error;
+	}
+
+	entry->page_start = pdt_address & RMI4_PAGE_MASK;
+	entry->query_base_addr = buf[0];
+	entry->command_base_addr = buf[1];
+	entry->control_base_addr = buf[2];
+	entry->data_base_addr = buf[3];
+	entry->interrupt_source_count = buf[4] & RMI_PDT_INT_SOURCE_COUNT_MASK;
+	entry->function_version = (buf[4] & RMI_PDT_FUNCTION_VERSION_MASK) >> 5;
+	entry->function_number = buf[5];
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(rmi_read_pdt_entry);
+
+static void rmi_driver_copy_pdt_to_fd(const struct pdt_entry *pdt,
+				      struct rmi_function_descriptor *fd)
+{
+	fd->query_base_addr = pdt->query_base_addr + pdt->page_start;
+	fd->command_base_addr = pdt->command_base_addr + pdt->page_start;
+	fd->control_base_addr = pdt->control_base_addr + pdt->page_start;
+	fd->data_base_addr = pdt->data_base_addr + pdt->page_start;
+	fd->function_number = pdt->function_number;
+	fd->interrupt_source_count = pdt->interrupt_source_count;
+	fd->function_version = pdt->function_version;
+}
+
+#define RMI_SCAN_CONTINUE	0
+#define RMI_SCAN_DONE		1
+
+static int rmi_scan_pdt_page(struct rmi_device *rmi_dev,
+			     int page,
+			     void *ctx,
+			     int (*callback)(struct rmi_device *rmi_dev,
+					     void *ctx,
+					     const struct pdt_entry *entry))
+{
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	struct pdt_entry pdt_entry;
+	u16 page_start = RMI4_PAGE_SIZE * page;
+	u16 pdt_start = page_start + PDT_START_SCAN_LOCATION;
+	u16 pdt_end = page_start + PDT_END_SCAN_LOCATION;
+	u16 addr;
+	int error;
+	int retval;
+
+	for (addr = pdt_start; addr >= pdt_end; addr -= RMI_PDT_ENTRY_SIZE) {
+		error = rmi_read_pdt_entry(rmi_dev, &pdt_entry, addr);
+		if (error)
+			return error;
+
+		if (RMI4_END_OF_PDT(pdt_entry.function_number))
+			break;
+
+		retval = callback(rmi_dev, ctx, &pdt_entry);
+		if (retval != RMI_SCAN_CONTINUE)
+			return retval;
+	}
+
+	return (data->f01_bootloader_mode || addr == pdt_start) ?
+					RMI_SCAN_DONE : RMI_SCAN_CONTINUE;
+}
+
+static int rmi_scan_pdt(struct rmi_device *rmi_dev, void *ctx,
+			int (*callback)(struct rmi_device *rmi_dev,
+					void *ctx,
+					const struct pdt_entry *entry))
+{
+	int page;
+	int retval = RMI_SCAN_DONE;
+
+	for (page = 0; page <= RMI4_MAX_PAGE; page++) {
+		retval = rmi_scan_pdt_page(rmi_dev, page, ctx, callback);
+		if (retval != RMI_SCAN_CONTINUE)
+			break;
+	}
+
+	return retval < 0 ? retval : 0;
+}
+
+int rmi_read_register_desc(struct rmi_device *d, u16 addr,
+				struct rmi_register_descriptor *rdesc)
+{
+	int ret;
+	u8 size_presence_reg;
+	u8 buf[35];
+	int presense_offset = 1;
+	u8 *struct_buf;
+	int reg;
+	int offset = 0;
+	int map_offset = 0;
+	int i;
+	int b;
+
+	/*
+	 * The first register of the register descriptor is the size of
+	 * the register descriptor's presense register.
+	 */
+	ret = rmi_read(d, addr, &size_presence_reg);
+	if (ret)
+		return ret;
+	++addr;
+
+	if (size_presence_reg < 0 || size_presence_reg > 35)
+		return -EIO;
+
+	memset(buf, 0, sizeof(buf));
+
+	/*
+	 * The presence register contains the size of the register structure
+	 * and a bitmap which identified which packet registers are present
+	 * for this particular register type (ie query, control, or data).
+	 */
+	ret = rmi_read_block(d, addr, buf, size_presence_reg);
+	if (ret)
+		return ret;
+	++addr;
+
+	if (buf[0] == 0) {
+		presense_offset = 3;
+		rdesc->struct_size = buf[1] | (buf[2] << 8);
+	} else {
+		rdesc->struct_size = buf[0];
+	}
+
+	for (i = presense_offset; i < size_presence_reg; i++) {
+		for (b = 0; b < 8; b++) {
+			if (buf[i] & (0x1 << b))
+				bitmap_set(rdesc->presense_map, map_offset, 1);
+			++map_offset;
+		}
+	}
+
+	rdesc->num_registers = bitmap_weight(rdesc->presense_map,
+						RMI_REG_DESC_PRESENSE_BITS);
+
+	rdesc->registers = devm_kzalloc(&d->dev, rdesc->num_registers *
+				sizeof(struct rmi_register_desc_item),
+				GFP_KERNEL);
+	if (!rdesc->registers)
+		return -ENOMEM;
+
+	/*
+	 * Allocate a temporary buffer to hold the register structure.
+	 * I'm not using devm_kzalloc here since it will not be retained
+	 * after exiting this function
+	 */
+	struct_buf = kzalloc(rdesc->struct_size, GFP_KERNEL);
+	if (!struct_buf)
+		return -ENOMEM;
+
+	/*
+	 * The register structure contains information about every packet
+	 * register of this type. This includes the size of the packet
+	 * register and a bitmap of all subpackets contained in the packet
+	 * register.
+	 */
+	ret = rmi_read_block(d, addr, struct_buf, rdesc->struct_size);
+	if (ret)
+		goto free_struct_buff;
+
+	reg = find_first_bit(rdesc->presense_map, RMI_REG_DESC_PRESENSE_BITS);
+	map_offset = 0;
+	for (i = 0; i < rdesc->num_registers; i++) {
+		struct rmi_register_desc_item *item = &rdesc->registers[i];
+		int reg_size = struct_buf[offset];
+
+		++offset;
+		if (reg_size == 0) {
+			reg_size = struct_buf[offset] |
+					(struct_buf[offset + 1] << 8);
+			offset += 2;
+		}
+
+		if (reg_size == 0) {
+			reg_size = struct_buf[offset] |
+					(struct_buf[offset + 1] << 8) |
+					(struct_buf[offset + 2] << 16) |
+					(struct_buf[offset + 3] << 24);
+			offset += 4;
+		}
+
+		item->reg = reg;
+		item->reg_size = reg_size;
+
+		do {
+			for (b = 0; b < 7; b++) {
+				if (struct_buf[offset] & (0x1 << b))
+					bitmap_set(item->subpacket_map,
+						map_offset, 1);
+				++map_offset;
+			}
+		} while (struct_buf[offset++] & 0x80);
+
+		item->num_subpackets = bitmap_weight(item->subpacket_map,
+						RMI_REG_DESC_SUBPACKET_BITS);
+
+		rmi_dbg(RMI_DEBUG_CORE, &d->dev,
+			"%s: reg: %d reg size: %ld subpackets: %d\n", __func__,
+			item->reg, item->reg_size, item->num_subpackets);
+
+		reg = find_next_bit(rdesc->presense_map,
+				RMI_REG_DESC_PRESENSE_BITS, reg + 1);
+	}
+
+free_struct_buff:
+	kfree(struct_buf);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rmi_read_register_desc);
+
+const struct rmi_register_desc_item *rmi_get_register_desc_item(
+				struct rmi_register_descriptor *rdesc, u16 reg)
+{
+	const struct rmi_register_desc_item *item;
+	int i;
+
+	for (i = 0; i < rdesc->num_registers; i++) {
+		item = &rdesc->registers[i];
+		if (item->reg == reg)
+			return item;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(rmi_get_register_desc_item);
+
+size_t rmi_register_desc_calc_size(struct rmi_register_descriptor *rdesc)
+{
+	const struct rmi_register_desc_item *item;
+	int i;
+	size_t size = 0;
+
+	for (i = 0; i < rdesc->num_registers; i++) {
+		item = &rdesc->registers[i];
+		size += item->reg_size;
+	}
+	return size;
+}
+EXPORT_SYMBOL_GPL(rmi_register_desc_calc_size);
+
+/* Compute the register offset relative to the base address */
+int rmi_register_desc_calc_reg_offset(
+		struct rmi_register_descriptor *rdesc, u16 reg)
+{
+	const struct rmi_register_desc_item *item;
+	int offset = 0;
+	int i;
+
+	for (i = 0; i < rdesc->num_registers; i++) {
+		item = &rdesc->registers[i];
+		if (item->reg == reg)
+			return offset;
+		++offset;
+	}
+	return -1;
+}
+EXPORT_SYMBOL_GPL(rmi_register_desc_calc_reg_offset);
+
+bool rmi_register_desc_has_subpacket(const struct rmi_register_desc_item *item,
+	u8 subpacket)
+{
+	return find_next_bit(item->subpacket_map, RMI_REG_DESC_PRESENSE_BITS,
+				subpacket) == subpacket;
+}
+
+/* Indicates that flash programming is enabled (bootloader mode). */
+#define RMI_F01_STATUS_BOOTLOADER(status)	(!!((status) & 0x40))
+
+/*
+ * Given the PDT entry for F01, read the device status register to determine
+ * if we're stuck in bootloader mode or not.
+ *
+ */
+static int rmi_check_bootloader_mode(struct rmi_device *rmi_dev,
+				     const struct pdt_entry *pdt)
+{
+	int error;
+	u8 device_status;
+
+	error = rmi_read(rmi_dev, pdt->data_base_addr + pdt->page_start,
+			 &device_status);
+	if (error) {
+		dev_err(&rmi_dev->dev,
+			"Failed to read device status: %d.\n", error);
+		return error;
+	}
+
+	return RMI_F01_STATUS_BOOTLOADER(device_status);
+}
+
+static int rmi_count_irqs(struct rmi_device *rmi_dev,
+			 void *ctx, const struct pdt_entry *pdt)
+{
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	int *irq_count = ctx;
+
+	*irq_count += pdt->interrupt_source_count;
+	if (pdt->function_number == 0x01) {
+		data->f01_bootloader_mode =
+			rmi_check_bootloader_mode(rmi_dev, pdt);
+		if (data->f01_bootloader_mode)
+			dev_warn(&rmi_dev->dev,
+				"WARNING: RMI4 device is in bootloader mode!\n");
+	}
+
+	return RMI_SCAN_CONTINUE;
+}
+
+static int rmi_initial_reset(struct rmi_device *rmi_dev,
+			     void *ctx, const struct pdt_entry *pdt)
+{
+	int error;
+
+	if (pdt->function_number == 0x01) {
+		u16 cmd_addr = pdt->page_start + pdt->command_base_addr;
+		u8 cmd_buf = RMI_DEVICE_RESET_CMD;
+		const struct rmi_device_platform_data *pdata =
+				rmi_get_platform_data(rmi_dev);
+
+		if (rmi_dev->xport->ops->reset) {
+			error = rmi_dev->xport->ops->reset(rmi_dev->xport,
+								cmd_addr);
+			if (error)
+				return error;
+
+			return RMI_SCAN_DONE;
+		}
+
+		error = rmi_write_block(rmi_dev, cmd_addr, &cmd_buf, 1);
+		if (error) {
+			dev_err(&rmi_dev->dev,
+				"Initial reset failed. Code = %d.\n", error);
+			return error;
+		}
+
+		mdelay(pdata->reset_delay_ms ?: DEFAULT_RESET_DELAY_MS);
+
+		return RMI_SCAN_DONE;
+	}
+
+	/* F01 should always be on page 0. If we don't find it there, fail. */
+	return pdt->page_start == 0 ? RMI_SCAN_CONTINUE : -ENODEV;
+}
+
+static int rmi_create_function(struct rmi_device *rmi_dev,
+			       void *ctx, const struct pdt_entry *pdt)
+{
+	struct device *dev = &rmi_dev->dev;
+	struct rmi_driver_data *data = dev_get_drvdata(&rmi_dev->dev);
+	int *current_irq_count = ctx;
+	struct rmi_function *fn;
+	int i;
+	int error;
+
+	rmi_dbg(RMI_DEBUG_CORE, dev, "Initializing F%02X.\n",
+			pdt->function_number);
+
+	fn = kzalloc(sizeof(struct rmi_function) +
+			BITS_TO_LONGS(data->irq_count) * sizeof(unsigned long),
+		     GFP_KERNEL);
+	if (!fn) {
+		dev_err(dev, "Failed to allocate memory for F%02X\n",
+			pdt->function_number);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&fn->node);
+	rmi_driver_copy_pdt_to_fd(pdt, &fn->fd);
+
+	fn->rmi_dev = rmi_dev;
+
+	fn->num_of_irqs = pdt->interrupt_source_count;
+	fn->irq_pos = *current_irq_count;
+	*current_irq_count += fn->num_of_irqs;
+
+	for (i = 0; i < fn->num_of_irqs; i++)
+		set_bit(fn->irq_pos + i, fn->irq_mask);
+
+	error = rmi_register_function(fn);
+	if (error)
+		goto err_put_fn;
+
+	if (pdt->function_number == 0x01)
+		data->f01_container = fn;
+
+	list_add_tail(&fn->node, &data->function_list);
+
+	return RMI_SCAN_CONTINUE;
+
+err_put_fn:
+	put_device(&fn->dev);
+	return error;
+}
+
+int rmi_driver_suspend(struct rmi_device *rmi_dev)
+{
+	int retval = 0;
+
+	retval = rmi_suspend_functions(rmi_dev);
+	if (retval)
+		dev_warn(&rmi_dev->dev, "Failed to suspend functions: %d\n",
+			retval);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(rmi_driver_suspend);
+
+int rmi_driver_resume(struct rmi_device *rmi_dev)
+{
+	int retval;
+
+	retval = rmi_resume_functions(rmi_dev);
+	if (retval)
+		dev_warn(&rmi_dev->dev, "Failed to suspend functions: %d\n",
+			retval);
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(rmi_driver_resume);
+
+static int rmi_driver_remove(struct device *dev)
+{
+	struct rmi_device *rmi_dev = to_rmi_device(dev);
+
+	rmi_free_function_list(rmi_dev);
+
+	return 0;
+}
+
+static int rmi_driver_probe(struct device *dev)
+{
+	struct rmi_driver *rmi_driver;
+	struct rmi_driver_data *data;
+	struct rmi_device_platform_data *pdata;
+	struct rmi_device *rmi_dev;
+	size_t size;
+	void *irq_memory;
+	int irq_count;
+	int retval;
+
+	rmi_dbg(RMI_DEBUG_CORE, dev, "%s: Starting probe.\n",
+			__func__);
+
+	if (!rmi_is_physical_device(dev)) {
+		rmi_dbg(RMI_DEBUG_CORE, dev, "Not a physical device.\n");
+		return -ENODEV;
+	}
+
+	rmi_dev = to_rmi_device(dev);
+	rmi_driver = to_rmi_driver(dev->driver);
+	rmi_dev->driver = rmi_driver;
+
+	pdata = rmi_get_platform_data(rmi_dev);
+
+	data = devm_kzalloc(dev, sizeof(struct rmi_driver_data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&data->function_list);
+	data->rmi_dev = rmi_dev;
+	dev_set_drvdata(&rmi_dev->dev, data);
+
+	/*
+	 * Right before a warm boot, the sensor might be in some unusual state,
+	 * such as F54 diagnostics, or F34 bootloader mode after a firmware
+	 * or configuration update.  In order to clear the sensor to a known
+	 * state and/or apply any updates, we issue a initial reset to clear any
+	 * previous settings and force it into normal operation.
+	 *
+	 * We have to do this before actually building the PDT because
+	 * the reflash updates (if any) might cause various registers to move
+	 * around.
+	 *
+	 * For a number of reasons, this initial reset may fail to return
+	 * within the specified time, but we'll still be able to bring up the
+	 * driver normally after that failure.  This occurs most commonly in
+	 * a cold boot situation (where then firmware takes longer to come up
+	 * than from a warm boot) and the reset_delay_ms in the platform data
+	 * has been set too short to accommodate that.  Since the sensor will
+	 * eventually come up and be usable, we don't want to just fail here
+	 * and leave the customer's device unusable.  So we warn them, and
+	 * continue processing.
+	 */
+	retval = rmi_scan_pdt(rmi_dev, NULL, rmi_initial_reset);
+	if (retval < 0)
+		dev_warn(dev, "RMI initial reset failed! Continuing in spite of this.\n");
+
+	retval = rmi_read(rmi_dev, PDT_PROPERTIES_LOCATION, &data->pdt_props);
+	if (retval < 0) {
+		/*
+		 * we'll print out a warning and continue since
+		 * failure to get the PDT properties is not a cause to fail
+		 */
+		dev_warn(dev, "Could not read PDT properties from %#06x (code %d). Assuming 0x00.\n",
+			 PDT_PROPERTIES_LOCATION, retval);
+	}
+
+	/*
+	 * We need to count the IRQs and allocate their storage before scanning
+	 * the PDT and creating the function entries, because adding a new
+	 * function can trigger events that result in the IRQ related storage
+	 * being accessed.
+	 */
+	rmi_dbg(RMI_DEBUG_CORE, dev, "Counting IRQs.\n");
+	irq_count = 0;
+	retval = rmi_scan_pdt(rmi_dev, &irq_count, rmi_count_irqs);
+	if (retval < 0) {
+		dev_err(dev, "IRQ counting failed with code %d.\n", retval);
+		goto err;
+	}
+	data->irq_count = irq_count;
+	data->num_of_irq_regs = (data->irq_count + 7) / 8;
+
+	mutex_init(&data->irq_mutex);
+
+	size = BITS_TO_LONGS(data->irq_count) * sizeof(unsigned long);
+	irq_memory = devm_kzalloc(dev, size * 4, GFP_KERNEL);
+	if (!irq_memory) {
+		dev_err(dev, "Failed to allocate memory for irq masks.\n");
+		goto err;
+	}
+
+	data->irq_status	= irq_memory + size * 0;
+	data->fn_irq_bits	= irq_memory + size * 1;
+	data->current_irq_mask	= irq_memory + size * 2;
+	data->new_irq_mask	= irq_memory + size * 3;
+
+	if (rmi_dev->xport->input) {
+		/*
+		 * The transport driver already has an input device.
+		 * In some cases it is preferable to reuse the transport
+		 * devices input device instead of creating a new one here.
+		 * One example is some HID touchpads report "pass-through"
+		 * button events are not reported by rmi registers.
+		 */
+		data->input = rmi_dev->xport->input;
+	} else {
+		data->input = devm_input_allocate_device(dev);
+		if (!data->input) {
+			dev_err(dev, "%s: Failed to allocate input device.\n",
+				__func__);
+			retval = -ENOMEM;
+			goto err_destroy_functions;
+		}
+		rmi_driver_set_input_params(rmi_dev, data->input);
+		data->input->phys = devm_kasprintf(dev, GFP_KERNEL,
+						"%s/input0", dev_name(dev));
+	}
+
+	irq_count = 0;
+	rmi_dbg(RMI_DEBUG_CORE, dev, "Creating functions.");
+	retval = rmi_scan_pdt(rmi_dev, &irq_count, rmi_create_function);
+	if (retval < 0) {
+		dev_err(dev, "Function creation failed with code %d.\n",
+			retval);
+		goto err_destroy_functions;
+	}
+
+	if (!data->f01_container) {
+		dev_err(dev, "Missing F01 container!\n");
+		retval = -EINVAL;
+		goto err_destroy_functions;
+	}
+
+	retval = rmi_read_block(rmi_dev,
+				data->f01_container->fd.control_base_addr + 1,
+				data->current_irq_mask, data->num_of_irq_regs);
+	if (retval < 0) {
+		dev_err(dev, "%s: Failed to read current IRQ mask.\n",
+			__func__);
+		goto err_destroy_functions;
+	}
+
+	if (data->input) {
+		rmi_driver_set_input_name(rmi_dev, data->input);
+		if (!rmi_dev->xport->input) {
+			if (input_register_device(data->input)) {
+				dev_err(dev, "%s: Failed to register input device.\n",
+					__func__);
+				goto err_destroy_functions;
+			}
+		}
+	}
+
+	if (data->f01_container->dev.driver)
+		/* Driver already bound, so enable ATTN now. */
+		return enable_sensor(rmi_dev);
+
+	return 0;
+
+err_destroy_functions:
+	rmi_free_function_list(rmi_dev);
+err:
+	return retval < 0 ? retval : 0;
+}
+
+static struct rmi_driver rmi_physical_driver = {
+	.driver = {
+		.owner	= THIS_MODULE,
+		.name	= "rmi4_physical",
+		.bus	= &rmi_bus_type,
+		.probe = rmi_driver_probe,
+		.remove = rmi_driver_remove,
+	},
+	.reset_handler = rmi_driver_reset_handler,
+	.clear_irq_bits = rmi_driver_clear_irq_bits,
+	.set_irq_bits = rmi_driver_set_irq_bits,
+	.set_input_params = rmi_driver_set_input_params,
+};
+
+bool rmi_is_physical_driver(struct device_driver *drv)
+{
+	return drv == &rmi_physical_driver.driver;
+}
+
+int __init rmi_register_physical_driver(void)
+{
+	int error;
+
+	error = driver_register(&rmi_physical_driver.driver);
+	if (error) {
+		pr_err("%s: driver register failed, code=%d.\n", __func__,
+		       error);
+		return error;
+	}
+
+	return 0;
+}
+
+void __exit rmi_unregister_physical_driver(void)
+{
+	driver_unregister(&rmi_physical_driver.driver);
+}
diff --git a/drivers/input/rmi4/rmi_driver.h b/drivers/input/rmi4/rmi_driver.h
new file mode 100644
index 000000000000..3ab3f1a8078f
--- /dev/null
+++ b/drivers/input/rmi4/rmi_driver.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2011-2016 Synaptics Incorporated
+ * Copyright (c) 2011 Unixphere
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef _RMI_DRIVER_H
+#define _RMI_DRIVER_H
+
+#include <linux/ctype.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/input.h>
+#include "rmi_bus.h"
+
+#define RMI_DRIVER_VERSION "2.0"
+
+#define SYNAPTICS_INPUT_DEVICE_NAME "Synaptics RMI4 Touch Sensor"
+#define SYNAPTICS_VENDOR_ID 0x06cb
+
+#define GROUP(_attrs) { \
+	.attrs = _attrs,  \
+}
+
+#define PDT_PROPERTIES_LOCATION 0x00EF
+#define BSR_LOCATION 0x00FE
+
+#define RMI_PDT_PROPS_HAS_BSR 0x02
+
+#define NAME_BUFFER_SIZE 256
+
+#define RMI_PDT_ENTRY_SIZE 6
+#define RMI_PDT_FUNCTION_VERSION_MASK   0x60
+#define RMI_PDT_INT_SOURCE_COUNT_MASK   0x07
+
+#define PDT_START_SCAN_LOCATION 0x00e9
+#define PDT_END_SCAN_LOCATION	0x0005
+#define RMI4_END_OF_PDT(id) ((id) == 0x00 || (id) == 0xff)
+
+struct pdt_entry {
+	u16 page_start;
+	u8 query_base_addr;
+	u8 command_base_addr;
+	u8 control_base_addr;
+	u8 data_base_addr;
+	u8 interrupt_source_count;
+	u8 function_version;
+	u8 function_number;
+};
+
+int rmi_read_pdt_entry(struct rmi_device *rmi_dev, struct pdt_entry *entry,
+			u16 pdt_address);
+
+#define RMI_REG_DESC_PRESENSE_BITS	(32 * BITS_PER_BYTE)
+#define RMI_REG_DESC_SUBPACKET_BITS	(37 * BITS_PER_BYTE)
+
+/* describes a single packet register */
+struct rmi_register_desc_item {
+	u16 reg;
+	unsigned long reg_size;
+	u8 num_subpackets;
+	unsigned long subpacket_map[BITS_TO_LONGS(
+				RMI_REG_DESC_SUBPACKET_BITS)];
+};
+
+/*
+ * describes the packet registers for a particular type
+ * (ie query, control, data)
+ */
+struct rmi_register_descriptor {
+	unsigned long struct_size;
+	unsigned long presense_map[BITS_TO_LONGS(RMI_REG_DESC_PRESENSE_BITS)];
+	u8 num_registers;
+	struct rmi_register_desc_item *registers;
+};
+
+int rmi_read_register_desc(struct rmi_device *d, u16 addr,
+				struct rmi_register_descriptor *rdesc);
+const struct rmi_register_desc_item *rmi_get_register_desc_item(
+				struct rmi_register_descriptor *rdesc, u16 reg);
+
+/*
+ * Calculate the total size of all of the registers described in the
+ * descriptor.
+ */
+size_t rmi_register_desc_calc_size(struct rmi_register_descriptor *rdesc);
+int rmi_register_desc_calc_reg_offset(
+			struct rmi_register_descriptor *rdesc, u16 reg);
+bool rmi_register_desc_has_subpacket(const struct rmi_register_desc_item *item,
+			u8 subpacket);
+
+bool rmi_is_physical_driver(struct device_driver *);
+int rmi_register_physical_driver(void);
+void rmi_unregister_physical_driver(void);
+
+char *rmi_f01_get_product_ID(struct rmi_function *fn);
+
+extern struct rmi_function_handler rmi_f01_handler;
+
+#endif
diff --git a/drivers/input/rmi4/rmi_f01.c b/drivers/input/rmi4/rmi_f01.c
new file mode 100644
index 000000000000..e50ecc6699e7
--- /dev/null
+++ b/drivers/input/rmi4/rmi_f01.c
@@ -0,0 +1,574 @@
+/*
+ * Copyright (c) 2011-2016 Synaptics Incorporated
+ * Copyright (c) 2011 Unixphere
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kconfig.h>
+#include <linux/rmi.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/of.h>
+#include "rmi_driver.h"
+
+#define RMI_PRODUCT_ID_LENGTH    10
+#define RMI_PRODUCT_INFO_LENGTH   2
+
+#define RMI_DATE_CODE_LENGTH      3
+
+#define PRODUCT_ID_OFFSET 0x10
+#define PRODUCT_INFO_OFFSET 0x1E
+
+
+/* Force a firmware reset of the sensor */
+#define RMI_F01_CMD_DEVICE_RESET	1
+
+/* Various F01_RMI_QueryX bits */
+
+#define RMI_F01_QRY1_CUSTOM_MAP		BIT(0)
+#define RMI_F01_QRY1_NON_COMPLIANT	BIT(1)
+#define RMI_F01_QRY1_HAS_LTS		BIT(2)
+#define RMI_F01_QRY1_HAS_SENSOR_ID	BIT(3)
+#define RMI_F01_QRY1_HAS_CHARGER_INP	BIT(4)
+#define RMI_F01_QRY1_HAS_ADJ_DOZE	BIT(5)
+#define RMI_F01_QRY1_HAS_ADJ_DOZE_HOFF	BIT(6)
+#define RMI_F01_QRY1_HAS_QUERY42	BIT(7)
+
+#define RMI_F01_QRY5_YEAR_MASK		0x1f
+#define RMI_F01_QRY6_MONTH_MASK		0x0f
+#define RMI_F01_QRY7_DAY_MASK		0x1f
+
+#define RMI_F01_QRY2_PRODINFO_MASK	0x7f
+
+#define RMI_F01_BASIC_QUERY_LEN		21 /* From Query 00 through 20 */
+
+struct f01_basic_properties {
+	u8 manufacturer_id;
+	bool has_lts;
+	bool has_adjustable_doze;
+	bool has_adjustable_doze_holdoff;
+	char dom[11]; /* YYYY/MM/DD + '\0' */
+	u8 product_id[RMI_PRODUCT_ID_LENGTH + 1];
+	u16 productinfo;
+	u32 firmware_id;
+};
+
+/* F01 device status bits */
+
+/* Most recent device status event */
+#define RMI_F01_STATUS_CODE(status)		((status) & 0x0f)
+/* The device has lost its configuration for some reason. */
+#define RMI_F01_STATUS_UNCONFIGURED(status)	(!!((status) & 0x80))
+
+/* Control register bits */
+
+/*
+ * Sleep mode controls power management on the device and affects all
+ * functions of the device.
+ */
+#define RMI_F01_CTRL0_SLEEP_MODE_MASK	0x03
+
+#define RMI_SLEEP_MODE_NORMAL		0x00
+#define RMI_SLEEP_MODE_SENSOR_SLEEP	0x01
+#define RMI_SLEEP_MODE_RESERVED0	0x02
+#define RMI_SLEEP_MODE_RESERVED1	0x03
+
+/*
+ * This bit disables whatever sleep mode may be selected by the sleep_mode
+ * field and forces the device to run at full power without sleeping.
+ */
+#define RMI_F01_CRTL0_NOSLEEP_BIT	BIT(2)
+
+/*
+ * When this bit is set, the touch controller employs a noise-filtering
+ * algorithm designed for use with a connected battery charger.
+ */
+#define RMI_F01_CRTL0_CHARGER_BIT	BIT(5)
+
+/*
+ * Sets the report rate for the device. The effect of this setting is
+ * highly product dependent. Check the spec sheet for your particular
+ * touch sensor.
+ */
+#define RMI_F01_CRTL0_REPORTRATE_BIT	BIT(6)
+
+/*
+ * Written by the host as an indicator that the device has been
+ * successfully configured.
+ */
+#define RMI_F01_CRTL0_CONFIGURED_BIT	BIT(7)
+
+/**
+ * @ctrl0 - see the bit definitions above.
+ * @doze_interval - controls the interval between checks for finger presence
+ * when the touch sensor is in doze mode, in units of 10ms.
+ * @wakeup_threshold - controls the capacitance threshold at which the touch
+ * sensor will decide to wake up from that low power state.
+ * @doze_holdoff - controls how long the touch sensor waits after the last
+ * finger lifts before entering the doze state, in units of 100ms.
+ */
+struct f01_device_control {
+	u8 ctrl0;
+	u8 doze_interval;
+	u8 wakeup_threshold;
+	u8 doze_holdoff;
+};
+
+struct f01_data {
+	struct f01_basic_properties properties;
+	struct f01_device_control device_control;
+
+	u16 doze_interval_addr;
+	u16 wakeup_threshold_addr;
+	u16 doze_holdoff_addr;
+
+	bool suspended;
+	bool old_nosleep;
+
+	unsigned int num_of_irq_regs;
+};
+
+static int rmi_f01_read_properties(struct rmi_device *rmi_dev,
+				   u16 query_base_addr,
+				   struct f01_basic_properties *props)
+{
+	u8 queries[RMI_F01_BASIC_QUERY_LEN];
+	int ret;
+	int query_offset = query_base_addr;
+	bool has_ds4_queries = false;
+	bool has_query42 = false;
+	bool has_sensor_id = false;
+	bool has_package_id_query = false;
+	bool has_build_id_query = false;
+	u16 prod_info_addr;
+	u8 ds4_query_len;
+
+	ret = rmi_read_block(rmi_dev, query_offset,
+			       queries, RMI_F01_BASIC_QUERY_LEN);
+	if (ret) {
+		dev_err(&rmi_dev->dev,
+			"Failed to read device query registers: %d\n", ret);
+		return ret;
+	}
+
+	prod_info_addr = query_offset + 17;
+	query_offset += RMI_F01_BASIC_QUERY_LEN;
+
+	/* Now parse what we got */
+	props->manufacturer_id = queries[0];
+
+	props->has_lts = queries[1] & RMI_F01_QRY1_HAS_LTS;
+	props->has_adjustable_doze =
+			queries[1] & RMI_F01_QRY1_HAS_ADJ_DOZE;
+	props->has_adjustable_doze_holdoff =
+			queries[1] & RMI_F01_QRY1_HAS_ADJ_DOZE_HOFF;
+	has_query42 = queries[1] & RMI_F01_QRY1_HAS_QUERY42;
+	has_sensor_id = queries[1] & RMI_F01_QRY1_HAS_SENSOR_ID;
+
+	snprintf(props->dom, sizeof(props->dom), "20%02d/%02d/%02d",
+		 queries[5] & RMI_F01_QRY5_YEAR_MASK,
+		 queries[6] & RMI_F01_QRY6_MONTH_MASK,
+		 queries[7] & RMI_F01_QRY7_DAY_MASK);
+
+	memcpy(props->product_id, &queries[11],
+		RMI_PRODUCT_ID_LENGTH);
+	props->product_id[RMI_PRODUCT_ID_LENGTH] = '\0';
+
+	props->productinfo =
+			((queries[2] & RMI_F01_QRY2_PRODINFO_MASK) << 7) |
+			(queries[3] & RMI_F01_QRY2_PRODINFO_MASK);
+
+	if (has_sensor_id)
+		query_offset++;
+
+	if (has_query42) {
+		ret = rmi_read(rmi_dev, query_offset, queries);
+		if (ret) {
+			dev_err(&rmi_dev->dev,
+				"Failed to read query 42 register: %d\n", ret);
+			return ret;
+		}
+
+		has_ds4_queries = !!(queries[0] & BIT(0));
+		query_offset++;
+	}
+
+	if (has_ds4_queries) {
+		ret = rmi_read(rmi_dev, query_offset, &ds4_query_len);
+		if (ret) {
+			dev_err(&rmi_dev->dev,
+				"Failed to read DS4 queries length: %d\n", ret);
+			return ret;
+		}
+		query_offset++;
+
+		if (ds4_query_len > 0) {
+			ret = rmi_read(rmi_dev, query_offset, queries);
+			if (ret) {
+				dev_err(&rmi_dev->dev,
+					"Failed to read DS4 queries: %d\n",
+					ret);
+				return ret;
+			}
+
+			has_package_id_query = !!(queries[0] & BIT(0));
+			has_build_id_query = !!(queries[0] & BIT(1));
+		}
+
+		if (has_package_id_query)
+			prod_info_addr++;
+
+		if (has_build_id_query) {
+			ret = rmi_read_block(rmi_dev, prod_info_addr, queries,
+					    3);
+			if (ret) {
+				dev_err(&rmi_dev->dev,
+					"Failed to read product info: %d\n",
+					ret);
+				return ret;
+			}
+
+			props->firmware_id = queries[1] << 8 | queries[0];
+			props->firmware_id += queries[2] * 65536;
+		}
+	}
+
+	return 0;
+}
+
+char *rmi_f01_get_product_ID(struct rmi_function *fn)
+{
+	struct f01_data *f01 = dev_get_drvdata(&fn->dev);
+
+	return f01->properties.product_id;
+}
+
+static int rmi_f01_probe(struct rmi_function *fn)
+{
+	struct rmi_device *rmi_dev = fn->rmi_dev;
+	struct rmi_driver_data *driver_data = dev_get_drvdata(&rmi_dev->dev);
+	struct rmi_device_platform_data *pdata = rmi_get_platform_data(rmi_dev);
+	struct f01_data *f01;
+	int error;
+	u16 ctrl_base_addr = fn->fd.control_base_addr;
+	u8 device_status;
+	u8 temp;
+
+	f01 = devm_kzalloc(&fn->dev, sizeof(struct f01_data), GFP_KERNEL);
+	if (!f01)
+		return -ENOMEM;
+
+	f01->num_of_irq_regs = driver_data->num_of_irq_regs;
+
+	/*
+	 * Set the configured bit and (optionally) other important stuff
+	 * in the device control register.
+	 */
+
+	error = rmi_read(rmi_dev, fn->fd.control_base_addr,
+			 &f01->device_control.ctrl0);
+	if (error) {
+		dev_err(&fn->dev, "Failed to read F01 control: %d\n", error);
+		return error;
+	}
+
+	switch (pdata->power_management.nosleep) {
+	case RMI_F01_NOSLEEP_DEFAULT:
+		break;
+	case RMI_F01_NOSLEEP_OFF:
+		f01->device_control.ctrl0 &= ~RMI_F01_CRTL0_NOSLEEP_BIT;
+		break;
+	case RMI_F01_NOSLEEP_ON:
+		f01->device_control.ctrl0 |= RMI_F01_CRTL0_NOSLEEP_BIT;
+		break;
+	}
+
+	/*
+	 * Sleep mode might be set as a hangover from a system crash or
+	 * reboot without power cycle.  If so, clear it so the sensor
+	 * is certain to function.
+	 */
+	if ((f01->device_control.ctrl0 & RMI_F01_CTRL0_SLEEP_MODE_MASK) !=
+			RMI_SLEEP_MODE_NORMAL) {
+		dev_warn(&fn->dev,
+			 "WARNING: Non-zero sleep mode found. Clearing...\n");
+		f01->device_control.ctrl0 &= ~RMI_F01_CTRL0_SLEEP_MODE_MASK;
+	}
+
+	f01->device_control.ctrl0 |= RMI_F01_CRTL0_CONFIGURED_BIT;
+
+	error = rmi_write(rmi_dev, fn->fd.control_base_addr,
+			  f01->device_control.ctrl0);
+	if (error) {
+		dev_err(&fn->dev, "Failed to write F01 control: %d\n", error);
+		return error;
+	}
+
+	/* Dummy read in order to clear irqs */
+	error = rmi_read(rmi_dev, fn->fd.data_base_addr + 1, &temp);
+	if (error < 0) {
+		dev_err(&fn->dev, "Failed to read Interrupt Status.\n");
+		return error;
+	}
+
+	error = rmi_f01_read_properties(rmi_dev, fn->fd.query_base_addr,
+					&f01->properties);
+	if (error < 0) {
+		dev_err(&fn->dev, "Failed to read F01 properties.\n");
+		return error;
+	}
+
+	dev_info(&fn->dev, "found RMI device, manufacturer: %s, product: %s, fw id: %d\n",
+		 f01->properties.manufacturer_id == 1 ? "Synaptics" : "unknown",
+		 f01->properties.product_id, f01->properties.firmware_id);
+
+	/* Advance to interrupt control registers, then skip over them. */
+	ctrl_base_addr++;
+	ctrl_base_addr += f01->num_of_irq_regs;
+
+	/* read control register */
+	if (f01->properties.has_adjustable_doze) {
+		f01->doze_interval_addr = ctrl_base_addr;
+		ctrl_base_addr++;
+
+		if (pdata->power_management.doze_interval) {
+			f01->device_control.doze_interval =
+				pdata->power_management.doze_interval;
+			error = rmi_write(rmi_dev, f01->doze_interval_addr,
+					  f01->device_control.doze_interval);
+			if (error) {
+				dev_err(&fn->dev,
+					"Failed to configure F01 doze interval register: %d\n",
+					error);
+				return error;
+			}
+		} else {
+			error = rmi_read(rmi_dev, f01->doze_interval_addr,
+					 &f01->device_control.doze_interval);
+			if (error) {
+				dev_err(&fn->dev,
+					"Failed to read F01 doze interval register: %d\n",
+					error);
+				return error;
+			}
+		}
+
+		f01->wakeup_threshold_addr = ctrl_base_addr;
+		ctrl_base_addr++;
+
+		if (pdata->power_management.wakeup_threshold) {
+			f01->device_control.wakeup_threshold =
+				pdata->power_management.wakeup_threshold;
+			error = rmi_write(rmi_dev, f01->wakeup_threshold_addr,
+					  f01->device_control.wakeup_threshold);
+			if (error) {
+				dev_err(&fn->dev,
+					"Failed to configure F01 wakeup threshold register: %d\n",
+					error);
+				return error;
+			}
+		} else {
+			error = rmi_read(rmi_dev, f01->wakeup_threshold_addr,
+					 &f01->device_control.wakeup_threshold);
+			if (error < 0) {
+				dev_err(&fn->dev,
+					"Failed to read F01 wakeup threshold register: %d\n",
+					error);
+				return error;
+			}
+		}
+	}
+
+	if (f01->properties.has_lts)
+		ctrl_base_addr++;
+
+	if (f01->properties.has_adjustable_doze_holdoff) {
+		f01->doze_holdoff_addr = ctrl_base_addr;
+		ctrl_base_addr++;
+
+		if (pdata->power_management.doze_holdoff) {
+			f01->device_control.doze_holdoff =
+				pdata->power_management.doze_holdoff;
+			error = rmi_write(rmi_dev, f01->doze_holdoff_addr,
+					  f01->device_control.doze_holdoff);
+			if (error) {
+				dev_err(&fn->dev,
+					"Failed to configure F01 doze holdoff register: %d\n",
+					error);
+				return error;
+			}
+		} else {
+			error = rmi_read(rmi_dev, f01->doze_holdoff_addr,
+					 &f01->device_control.doze_holdoff);
+			if (error) {
+				dev_err(&fn->dev,
+					"Failed to read F01 doze holdoff register: %d\n",
+					error);
+				return error;
+			}
+		}
+	}
+
+	error = rmi_read(rmi_dev, fn->fd.data_base_addr, &device_status);
+	if (error < 0) {
+		dev_err(&fn->dev,
+			"Failed to read device status: %d\n", error);
+		return error;
+	}
+
+	if (RMI_F01_STATUS_UNCONFIGURED(device_status)) {
+		dev_err(&fn->dev,
+			"Device was reset during configuration process, status: %#02x!\n",
+			RMI_F01_STATUS_CODE(device_status));
+		return -EINVAL;
+	}
+
+	dev_set_drvdata(&fn->dev, f01);
+
+	return 0;
+}
+
+static int rmi_f01_config(struct rmi_function *fn)
+{
+	struct f01_data *f01 = dev_get_drvdata(&fn->dev);
+	int error;
+
+	error = rmi_write(fn->rmi_dev, fn->fd.control_base_addr,
+			  f01->device_control.ctrl0);
+	if (error) {
+		dev_err(&fn->dev,
+			"Failed to write device_control register: %d\n", error);
+		return error;
+	}
+
+	if (f01->properties.has_adjustable_doze) {
+		error = rmi_write(fn->rmi_dev, f01->doze_interval_addr,
+				  f01->device_control.doze_interval);
+		if (error) {
+			dev_err(&fn->dev,
+				"Failed to write doze interval: %d\n", error);
+			return error;
+		}
+
+		error = rmi_write_block(fn->rmi_dev,
+					 f01->wakeup_threshold_addr,
+					 &f01->device_control.wakeup_threshold,
+					 sizeof(u8));
+		if (error) {
+			dev_err(&fn->dev,
+				"Failed to write wakeup threshold: %d\n",
+				error);
+			return error;
+		}
+	}
+
+	if (f01->properties.has_adjustable_doze_holdoff) {
+		error = rmi_write(fn->rmi_dev, f01->doze_holdoff_addr,
+				  f01->device_control.doze_holdoff);
+		if (error) {
+			dev_err(&fn->dev,
+				"Failed to write doze holdoff: %d\n", error);
+			return error;
+		}
+	}
+
+	return 0;
+}
+
+static int rmi_f01_suspend(struct rmi_function *fn)
+{
+	struct f01_data *f01 = dev_get_drvdata(&fn->dev);
+	int error;
+
+	f01->old_nosleep =
+		f01->device_control.ctrl0 & RMI_F01_CRTL0_NOSLEEP_BIT;
+	f01->device_control.ctrl0 &= ~RMI_F01_CRTL0_NOSLEEP_BIT;
+
+	f01->device_control.ctrl0 &= ~RMI_F01_CTRL0_SLEEP_MODE_MASK;
+	if (device_may_wakeup(fn->rmi_dev->xport->dev))
+		f01->device_control.ctrl0 |= RMI_SLEEP_MODE_RESERVED1;
+	else
+		f01->device_control.ctrl0 |= RMI_SLEEP_MODE_SENSOR_SLEEP;
+
+	error = rmi_write(fn->rmi_dev, fn->fd.control_base_addr,
+			  f01->device_control.ctrl0);
+	if (error) {
+		dev_err(&fn->dev, "Failed to write sleep mode: %d.\n", error);
+		if (f01->old_nosleep)
+			f01->device_control.ctrl0 |= RMI_F01_CRTL0_NOSLEEP_BIT;
+		f01->device_control.ctrl0 &= ~RMI_F01_CTRL0_SLEEP_MODE_MASK;
+		f01->device_control.ctrl0 |= RMI_SLEEP_MODE_NORMAL;
+		return error;
+	}
+
+	return 0;
+}
+
+static int rmi_f01_resume(struct rmi_function *fn)
+{
+	struct f01_data *f01 = dev_get_drvdata(&fn->dev);
+	int error;
+
+	if (f01->old_nosleep)
+		f01->device_control.ctrl0 |= RMI_F01_CRTL0_NOSLEEP_BIT;
+
+	f01->device_control.ctrl0 &= ~RMI_F01_CTRL0_SLEEP_MODE_MASK;
+	f01->device_control.ctrl0 |= RMI_SLEEP_MODE_NORMAL;
+
+	error = rmi_write(fn->rmi_dev, fn->fd.control_base_addr,
+			  f01->device_control.ctrl0);
+	if (error) {
+		dev_err(&fn->dev,
+			"Failed to restore normal operation: %d.\n", error);
+		return error;
+	}
+
+	return 0;
+}
+
+static int rmi_f01_attention(struct rmi_function *fn,
+			     unsigned long *irq_bits)
+{
+	struct rmi_device *rmi_dev = fn->rmi_dev;
+	int error;
+	u8 device_status;
+
+	error = rmi_read(rmi_dev, fn->fd.data_base_addr, &device_status);
+	if (error) {
+		dev_err(&fn->dev,
+			"Failed to read device status: %d.\n", error);
+		return error;
+	}
+
+	if (RMI_F01_STATUS_UNCONFIGURED(device_status)) {
+		dev_warn(&fn->dev, "Device reset detected.\n");
+		error = rmi_dev->driver->reset_handler(rmi_dev);
+		if (error) {
+			dev_err(&fn->dev, "Device reset failed: %d\n", error);
+			return error;
+		}
+	}
+
+	return 0;
+}
+
+struct rmi_function_handler rmi_f01_handler = {
+	.driver = {
+		.name	= "rmi4_f01",
+		/*
+		 * Do not allow user unbinding F01 as it is critical
+		 * function.
+		 */
+		.suppress_bind_attrs = true,
+	},
+	.func		= 0x01,
+	.probe		= rmi_f01_probe,
+	.config		= rmi_f01_config,
+	.attention	= rmi_f01_attention,
+	.suspend	= rmi_f01_suspend,
+	.resume		= rmi_f01_resume,
+};
diff --git a/include/linux/rmi.h b/include/linux/rmi.h
new file mode 100644
index 000000000000..c748fa302067
--- /dev/null
+++ b/include/linux/rmi.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2011-2016 Synaptics Incorporated
+ * Copyright (c) 2011 Unixphere
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#ifndef _RMI_H
+#define _RMI_H
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/input.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/types.h>
+
+#define NAME_BUFFER_SIZE 256
+
+/**
+ * struct rmi_f01_power - override default power management settings.
+ *
+ */
+enum rmi_f01_nosleep {
+	RMI_F01_NOSLEEP_DEFAULT = 0,
+	RMI_F01_NOSLEEP_OFF = 1,
+	RMI_F01_NOSLEEP_ON = 2
+};
+
+/**
+ * struct rmi_f01_power_management -When non-zero, these values will be written
+ * to the touch sensor to override the default firmware settigns.  For a
+ * detailed explanation of what each field does, see the corresponding
+ * documention in the RMI4 specification.
+ *
+ * @nosleep - specifies whether the device is permitted to sleep or doze (that
+ * is, enter a temporary low power state) when no fingers are touching the
+ * sensor.
+ * @wakeup_threshold - controls the capacitance threshold at which the touch
+ * sensor will decide to wake up from that low power state.
+ * @doze_holdoff - controls how long the touch sensor waits after the last
+ * finger lifts before entering the doze state, in units of 100ms.
+ * @doze_interval - controls the interval between checks for finger presence
+ * when the touch sensor is in doze mode, in units of 10ms.
+ */
+struct rmi_f01_power_management {
+	enum rmi_f01_nosleep nosleep;
+	u8 wakeup_threshold;
+	u8 doze_holdoff;
+	u8 doze_interval;
+};
+
+/**
+ * struct rmi_device_platform_data - system specific configuration info.
+ *
+ * @reset_delay_ms - after issuing a reset command to the touch sensor, the
+ * driver waits a few milliseconds to give the firmware a chance to
+ * to re-initialize.  You can override the default wait period here.
+ */
+struct rmi_device_platform_data {
+	int reset_delay_ms;
+
+	/* function handler pdata */
+	struct rmi_f01_power_management power_management;
+};
+
+/**
+ * struct rmi_function_descriptor - RMI function base addresses
+ *
+ * @query_base_addr: The RMI Query base address
+ * @command_base_addr: The RMI Command base address
+ * @control_base_addr: The RMI Control base address
+ * @data_base_addr: The RMI Data base address
+ * @interrupt_source_count: The number of irqs this RMI function needs
+ * @function_number: The RMI function number
+ *
+ * This struct is used when iterating the Page Description Table. The addresses
+ * are 16-bit values to include the current page address.
+ *
+ */
+struct rmi_function_descriptor {
+	u16 query_base_addr;
+	u16 command_base_addr;
+	u16 control_base_addr;
+	u16 data_base_addr;
+	u8 interrupt_source_count;
+	u8 function_number;
+	u8 function_version;
+};
+
+struct rmi_device;
+
+/**
+ * struct rmi_transport_dev - represent an RMI transport device
+ *
+ * @dev: Pointer to the communication device, e.g. i2c or spi
+ * @rmi_dev: Pointer to the RMI device
+ * @proto_name: name of the transport protocol (SPI, i2c, etc)
+ * @ops: pointer to transport operations implementation
+ *
+ * The RMI transport device implements the glue between different communication
+ * buses such as I2C and SPI.
+ *
+ */
+struct rmi_transport_dev {
+	struct device *dev;
+	struct rmi_device *rmi_dev;
+
+	const char *proto_name;
+	const struct rmi_transport_ops *ops;
+
+	struct rmi_device_platform_data pdata;
+
+	struct input_dev *input;
+
+	void *attn_data;
+	int attn_size;
+};
+
+/**
+ * struct rmi_transport_ops - defines transport protocol operations.
+ *
+ * @write_block: Writing a block of data to the specified address
+ * @read_block: Read a block of data from the specified address.
+ */
+struct rmi_transport_ops {
+	int (*write_block)(struct rmi_transport_dev *xport, u16 addr,
+			   const void *buf, size_t len);
+	int (*read_block)(struct rmi_transport_dev *xport, u16 addr,
+			  void *buf, size_t len);
+	int (*reset)(struct rmi_transport_dev *xport, u16 reset_addr);
+};
+
+/**
+ * struct rmi_driver - driver for an RMI4 sensor on the RMI bus.
+ *
+ * @driver: Device driver model driver
+ * @reset_handler: Called when a reset is detected.
+ * @clear_irq_bits: Clear the specified bits in the current interrupt mask.
+ * @set_irq_bist: Set the specified bits in the current interrupt mask.
+ * @store_productid: Callback for cache product id from function 01
+ * @data: Private data pointer
+ *
+ */
+struct rmi_driver {
+	struct device_driver driver;
+
+	int (*reset_handler)(struct rmi_device *rmi_dev);
+	int (*clear_irq_bits)(struct rmi_device *rmi_dev, unsigned long *mask);
+	int (*set_irq_bits)(struct rmi_device *rmi_dev, unsigned long *mask);
+	int (*store_productid)(struct rmi_device *rmi_dev);
+	int (*set_input_params)(struct rmi_device *rmi_dev,
+			struct input_dev *input);
+	void *data;
+};
+
+/**
+ * struct rmi_device - represents an RMI4 sensor device on the RMI bus.
+ *
+ * @dev: The device created for the RMI bus
+ * @number: Unique number for the device on the bus.
+ * @driver: Pointer to associated driver
+ * @xport: Pointer to the transport interface
+ *
+ */
+struct rmi_device {
+	struct device dev;
+	int number;
+
+	struct rmi_driver *driver;
+	struct rmi_transport_dev *xport;
+
+};
+
+struct rmi_driver_data {
+	struct list_head function_list;
+
+	struct rmi_device *rmi_dev;
+
+	struct rmi_function *f01_container;
+	bool f01_bootloader_mode;
+
+	u32 attn_count;
+	int num_of_irq_regs;
+	int irq_count;
+	unsigned long *irq_status;
+	unsigned long *fn_irq_bits;
+	unsigned long *current_irq_mask;
+	unsigned long *new_irq_mask;
+	struct mutex irq_mutex;
+	struct input_dev *input;
+
+	u8 pdt_props;
+	u8 bsr;
+
+	bool enabled;
+
+	void *data;
+};
+
+int rmi_register_transport_device(struct rmi_transport_dev *xport);
+void rmi_unregister_transport_device(struct rmi_transport_dev *xport);
+int rmi_process_interrupt_requests(struct rmi_device *rmi_dev);
+
+int rmi_driver_suspend(struct rmi_device *rmi_dev);
+int rmi_driver_resume(struct rmi_device *rmi_dev);
+#endif
diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h
index 2758687300b4..01113841190d 100644
--- a/include/uapi/linux/input.h
+++ b/include/uapi/linux/input.h
@@ -246,6 +246,7 @@ struct input_mask {
 #define BUS_GSC			0x1A
 #define BUS_ATARI		0x1B
 #define BUS_SPI			0x1C
+#define BUS_RMI			0x1D
 
 /*
  * MT_TOOL types
-- 
cgit v1.2.3


From 0308813724606549436d30efd877a80c8e00790e Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 4 Mar 2016 06:24:53 -0500
Subject: vhost_net: basic polling support

This patch tries to poll for new added tx buffer or socket receive
queue for a while at the end of tx/rx processing. The maximum time
spent on polling were specified through a new kind of vring ioctl.

Signed-off-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/vhost/net.c        | 78 +++++++++++++++++++++++++++++++++++++++++++---
 drivers/vhost/vhost.c      | 14 +++++++++
 drivers/vhost/vhost.h      |  1 +
 include/uapi/linux/vhost.h |  6 ++++
 4 files changed, 94 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7bd75ff8be26..f744eeb3e2b4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -287,6 +287,43 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
 	rcu_read_unlock_bh();
 }
 
+static inline unsigned long busy_clock(void)
+{
+	return local_clock() >> 10;
+}
+
+static bool vhost_can_busy_poll(struct vhost_dev *dev,
+				unsigned long endtime)
+{
+	return likely(!need_resched()) &&
+	       likely(!time_after(busy_clock(), endtime)) &&
+	       likely(!signal_pending(current)) &&
+	       !vhost_has_work(dev);
+}
+
+static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
+				    struct vhost_virtqueue *vq,
+				    struct iovec iov[], unsigned int iov_size,
+				    unsigned int *out_num, unsigned int *in_num)
+{
+	unsigned long uninitialized_var(endtime);
+	int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+				    out_num, in_num, NULL, NULL);
+
+	if (r == vq->num && vq->busyloop_timeout) {
+		preempt_disable();
+		endtime = busy_clock() + vq->busyloop_timeout;
+		while (vhost_can_busy_poll(vq->dev, endtime) &&
+		       vhost_vq_avail_empty(vq->dev, vq))
+			cpu_relax_lowlatency();
+		preempt_enable();
+		r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+					out_num, in_num, NULL, NULL);
+	}
+
+	return r;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_tx(struct vhost_net *net)
@@ -331,10 +368,9 @@ static void handle_tx(struct vhost_net *net)
 			      % UIO_MAXIOV == nvq->done_idx))
 			break;
 
-		head = vhost_get_vq_desc(vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 NULL, NULL);
+		head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
+						ARRAY_SIZE(vq->iov),
+						&out, &in);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(head < 0))
 			break;
@@ -435,6 +471,38 @@ static int peek_head_len(struct sock *sk)
 	return len;
 }
 
+static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
+{
+	struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+	struct vhost_virtqueue *vq = &nvq->vq;
+	unsigned long uninitialized_var(endtime);
+	int len = peek_head_len(sk);
+
+	if (!len && vq->busyloop_timeout) {
+		/* Both tx vq and rx socket were polled here */
+		mutex_lock(&vq->mutex);
+		vhost_disable_notify(&net->dev, vq);
+
+		preempt_disable();
+		endtime = busy_clock() + vq->busyloop_timeout;
+
+		while (vhost_can_busy_poll(&net->dev, endtime) &&
+		       skb_queue_empty(&sk->sk_receive_queue) &&
+		       vhost_vq_avail_empty(&net->dev, vq))
+			cpu_relax_lowlatency();
+
+		preempt_enable();
+
+		if (vhost_enable_notify(&net->dev, vq))
+			vhost_poll_queue(&vq->poll);
+		mutex_unlock(&vq->mutex);
+
+		len = peek_head_len(sk);
+	}
+
+	return len;
+}
+
 /* This is a multi-buffer version of vhost_get_desc, that works if
  *	vq has read descriptors only.
  * @vq		- the relevant virtqueue
@@ -553,7 +621,7 @@ static void handle_rx(struct vhost_net *net)
 		vq->log : NULL;
 	mergeable = vhost_has_feature(vq, VIRTIO_NET_F_MRG_RXBUF);
 
-	while ((sock_len = peek_head_len(sock->sk))) {
+	while ((sock_len = vhost_net_rx_peek_head_len(net, sock->sk))) {
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
 		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 97f26f0aab40..669fef1e2bb6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -303,6 +303,7 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 	vq->memory = NULL;
 	vhost_reset_is_le(vq);
 	vhost_disable_cross_endian(vq);
+	vq->busyloop_timeout = 0;
 }
 
 static int vhost_worker(void *data)
@@ -937,6 +938,19 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
 	case VHOST_GET_VRING_ENDIAN:
 		r = vhost_get_vring_endian(vq, idx, argp);
 		break;
+	case VHOST_SET_VRING_BUSYLOOP_TIMEOUT:
+		if (copy_from_user(&s, argp, sizeof(s))) {
+			r = -EFAULT;
+			break;
+		}
+		vq->busyloop_timeout = s.num;
+		break;
+	case VHOST_GET_VRING_BUSYLOOP_TIMEOUT:
+		s.index = idx;
+		s.num = vq->busyloop_timeout;
+		if (copy_to_user(argp, &s, sizeof(s)))
+			r = -EFAULT;
+		break;
 	default:
 		r = -ENOIOCTLCMD;
 	}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index af5af773bf7a..d36d8beb3351 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -115,6 +115,7 @@ struct vhost_virtqueue {
 	/* Ring endianness requested by userspace for cross-endian support. */
 	bool user_be;
 #endif
+	u32 busyloop_timeout;
 };
 
 struct vhost_dev {
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index ab3731917bac..61a8777178c6 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -126,6 +126,12 @@ struct vhost_memory {
 #define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
 /* Set eventfd to signal an error */
 #define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
+/* Set busy loop timeout (in us) */
+#define VHOST_SET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x23,	\
+					 struct vhost_vring_state)
+/* Get busy loop timeout (in us) */
+#define VHOST_GET_VRING_BUSYLOOP_TIMEOUT _IOW(VHOST_VIRTIO, 0x24,	\
+					 struct vhost_vring_state)
 
 /* VHOST_NET specific defines */
 
-- 
cgit v1.2.3


From 32c76de3466ed2a875e36c140ac4e3800fdfab6e Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@yasker.org>
Date: Mon, 29 Feb 2016 16:02:15 -0800
Subject: target/user: Report capability of handling out-of-order completions
 to userspace

TCMU_MAILBOX_FLAG_CAP_OOOC was introduced, and userspace can check the flag
for out-of-order completion capability support.

Also update the document on how to use the feature.

Signed-off-by: Sheng Yang <sheng@yasker.org>
Reviewed-by: Andy Grover <agrover@redhat.com>
Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
---
 Documentation/target/tcmu-design.txt  | 11 ++++++++++-
 drivers/target/target_core_user.c     |  1 +
 include/uapi/linux/target_core_user.h |  1 +
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/Documentation/target/tcmu-design.txt b/Documentation/target/tcmu-design.txt
index bef81e42788f..4cebc1ebf99a 100644
--- a/Documentation/target/tcmu-design.txt
+++ b/Documentation/target/tcmu-design.txt
@@ -117,7 +117,9 @@ userspace (respectively) to put commands on the ring, and indicate
 when the commands are completed.
 
 version - 1 (userspace should abort if otherwise)
-flags - none yet defined.
+flags:
+- TCMU_MAILBOX_FLAG_CAP_OOOC: indicates out-of-order completion is
+  supported.  See "The Command Ring" for details.
 cmdr_off - The offset of the start of the command ring from the start
 of the memory region, to account for the mailbox size.
 cmdr_size - The size of the command ring. This does *not* need to be a
@@ -162,6 +164,13 @@ rsp.sense_buffer if necessary. Userspace then increments
 mailbox.cmd_tail by entry.hdr.length (mod cmdr_size) and signals the
 kernel via the UIO method, a 4-byte write to the file descriptor.
 
+If TCMU_MAILBOX_FLAG_CAP_OOOC is set for mailbox->flags, kernel is
+capable of handling out-of-order completions. In this case, userspace can
+handle command in different order other than original. Since kernel would
+still process the commands in the same order it appeared in the command
+ring, userspace need to update the cmd->id when completing the
+command(a.k.a steal the original command's entry).
+
 When the opcode is PAD, userspace only updates cmd_tail as above --
 it's a no-op. (The kernel inserts PAD entries to ensure each CMD entry
 is contiguous within the command ring.)
diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index fc4789c6067c..62bf4fe5704a 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -930,6 +930,7 @@ static int tcmu_configure_device(struct se_device *dev)
 
 	mb = udev->mb_addr;
 	mb->version = TCMU_MAILBOX_VERSION;
+	mb->flags = TCMU_MAILBOX_FLAG_CAP_OOOC;
 	mb->cmdr_off = CMDR_OFF;
 	mb->cmdr_size = udev->cmdr_size;
 
diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h
index 95c6521d8a95..c506cddb8165 100644
--- a/include/uapi/linux/target_core_user.h
+++ b/include/uapi/linux/target_core_user.h
@@ -41,6 +41,7 @@
 
 #define TCMU_MAILBOX_VERSION 2
 #define ALIGN_SIZE 64 /* Should be enough for most CPUs */
+#define TCMU_MAILBOX_FLAG_CAP_OOOC (1 << 0) /* Out-of-order completions */
 
 struct tcmu_mailbox {
 	__u16 version;
-- 
cgit v1.2.3


From e7f70af111f086a20800ad2e17f544b2e3e0f375 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:03 +0100
Subject: vxlan: support setting IPv6 flow label

This work adds support for setting the IPv6 flow label for vxlan per
device and through collect metadata (ip_tunnel_key) frontends. The
vxlan dst cache does not need any special considerations here, for
the cases where caches can be used, the label is static per cache.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c          | 26 +++++++++++++++++++++-----
 include/net/vxlan.h          |  1 +
 include/uapi/linux/if_link.h |  1 +
 3 files changed, 23 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 8bdcd5ea8424..8eda76f9e474 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1790,6 +1790,7 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
 #if IS_ENABLED(CONFIG_IPV6)
 static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 					  struct sk_buff *skb, int oif, u8 tos,
+					  __be32 label,
 					  const struct in6_addr *daddr,
 					  struct in6_addr *saddr,
 					  struct dst_cache *dst_cache,
@@ -1813,6 +1814,7 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
 	fl6.flowi6_tos = RT_TOS(tos);
 	fl6.daddr = *daddr;
 	fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr;
+	fl6.flowlabel = label;
 	fl6.flowi6_mark = skb->mark;
 	fl6.flowi6_proto = IPPROTO_UDP;
 
@@ -1888,7 +1890,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	struct vxlan_metadata _md;
 	struct vxlan_metadata *md = &_md;
 	__be16 src_port = 0, dst_port;
-	__be32 vni;
+	__be32 vni, label;
 	__be16 df = 0;
 	__u8 tos, ttl;
 	int err;
@@ -1939,12 +1941,14 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 	if (tos == 1)
 		tos = ip_tunnel_get_dsfield(old_iph, skb);
 
+	label = vxlan->cfg.label;
 	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
 				     vxlan->cfg.port_max, true);
 
 	if (info) {
 		ttl = info->key.ttl;
 		tos = info->key.tos;
+		label = info->key.label;
 		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
 
 		if (info->options_len)
@@ -2020,7 +2024,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 
 		ndst = vxlan6_get_route(vxlan, skb,
 					rdst ? rdst->remote_ifindex : 0, tos,
-					&dst->sin6.sin6_addr, &saddr,
+					label, &dst->sin6.sin6_addr, &saddr,
 					dst_cache, info);
 		if (IS_ERR(ndst)) {
 			netdev_dbg(dev, "no route to %pI6\n",
@@ -2066,8 +2070,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
 			return;
 		}
 		udp_tunnel6_xmit_skb(ndst, sk, skb, dev,
-				     &saddr, &dst->sin6.sin6_addr, tos, ttl, 0,
-				     src_port, dst_port, !udp_sum);
+				     &saddr, &dst->sin6.sin6_addr, tos, ttl,
+				     label, src_port, dst_port, !udp_sum);
 #endif
 	}
 
@@ -2390,7 +2394,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 		if (!vxlan->vn6_sock)
 			return -EINVAL;
 		ndst = vxlan6_get_route(vxlan, skb, 0, info->key.tos,
-					&info->key.u.ipv6.dst,
+					info->key.label, &info->key.u.ipv6.dst,
 					&info->key.u.ipv6.src, NULL, info);
 		if (IS_ERR(ndst))
 			return PTR_ERR(ndst);
@@ -2505,6 +2509,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
 	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
 	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
+	[IFLA_VXLAN_LABEL]	= { .type = NLA_U32 },
 	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
 	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
 	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
@@ -2739,6 +2744,11 @@ static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
 		vxlan->flags |= VXLAN_F_IPV6;
 	}
 
+	if (conf->label && !use_ipv6) {
+		pr_info("label only supported in use with IPv6\n");
+		return -EINVAL;
+	}
+
 	if (conf->remote_ifindex) {
 		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
 		dst->remote_ifindex = conf->remote_ifindex;
@@ -2887,6 +2897,10 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev,
 	if (data[IFLA_VXLAN_TTL])
 		conf.ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
 
+	if (data[IFLA_VXLAN_LABEL])
+		conf.label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
+			     IPV6_FLOWLABEL_MASK;
+
 	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
 		conf.flags |= VXLAN_F_LEARN;
 
@@ -2990,6 +3004,7 @@ static size_t vxlan_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
+		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
 		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
@@ -3053,6 +3068,7 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
 
 	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
 	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
+	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
 	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
 			!!(vxlan->flags & VXLAN_F_LEARN)) ||
 	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 6eda4ed4d78b..a763c96ecde4 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -162,6 +162,7 @@ struct vxlan_config {
 	u16			port_max;
 	u8			tos;
 	u8			ttl;
+	__be32			label;
 	u32			flags;
 	unsigned long		age_interval;
 	unsigned int		addrmax;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index d452cea59020..6bebc975031d 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -456,6 +456,7 @@ enum {
 	IFLA_VXLAN_GBP,
 	IFLA_VXLAN_REMCSUM_NOPARTIAL,
 	IFLA_VXLAN_COLLECT_METADATA,
+	IFLA_VXLAN_LABEL,
 	__IFLA_VXLAN_MAX
 };
 #define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
-- 
cgit v1.2.3


From 8eb3b99554b82da968d1fbc00df9f3156c5e2d63 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:04 +0100
Subject: geneve: support setting IPv6 flow label

This work adds support for setting the IPv6 flow label for geneve per
device and through collect metadata (ip_tunnel_key) frontends. Also here,
the geneve dst cache does not need any special considerations, for the
cases where caches can be used, the label is static per cache.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/geneve.c         | 35 +++++++++++++++++++++++++++--------
 include/uapi/linux/if_link.h |  1 +
 2 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 89ccff79d76c..33185b9a435e 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -68,6 +68,7 @@ struct geneve_dev {
 	u8                 tos;		/* TOS override */
 	union geneve_addr  remote;	/* IP address for link partner */
 	struct list_head   next;	/* geneve's per namespace list */
+	__be32		   label;	/* IPv6 flowlabel override */
 	__be16		   dst_port;
 	bool		   collect_md;
 	struct gro_cells   gro_cells;
@@ -846,6 +847,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 		fl6->daddr = info->key.u.ipv6.dst;
 		fl6->saddr = info->key.u.ipv6.src;
 		fl6->flowi6_tos = RT_TOS(info->key.tos);
+		fl6->flowlabel = info->key.label;
 		dst_cache = &info->dst_cache;
 	} else {
 		prio = geneve->tos;
@@ -857,6 +859,7 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
 		}
 
 		fl6->flowi6_tos = RT_TOS(prio);
+		fl6->flowlabel = geneve->label;
 		fl6->daddr = geneve->remote.sin6.sin6_addr;
 		dst_cache = &geneve->dst_cache;
 	}
@@ -998,6 +1001,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 	struct flowi6 fl6;
 	__u8 prio, ttl;
 	__be16 sport;
+	__be32 label;
 	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
 	u32 flags = geneve->flags;
 
@@ -1041,6 +1045,7 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 
 		prio = ip_tunnel_ecn_encap(key->tos, iip, skb);
 		ttl = key->ttl;
+		label = info->key.label;
 	} else {
 		err = geneve6_build_skb(dst, skb, 0, geneve->vni,
 					0, NULL, flags, xnet);
@@ -1052,9 +1057,11 @@ static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
 		if (!ttl && ipv6_addr_is_multicast(&fl6.daddr))
 			ttl = 1;
 		ttl = ttl ? : ip6_dst_hoplimit(dst);
+		label = geneve->label;
 	}
+
 	udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
-			     &fl6.saddr, &fl6.daddr, prio, ttl, 0,
+			     &fl6.saddr, &fl6.daddr, prio, ttl, label,
 			     sport, geneve->dst_port,
 			     !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX));
 	return NETDEV_TX_OK;
@@ -1238,6 +1245,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
 	[IFLA_GENEVE_REMOTE6]		= { .len = sizeof(struct in6_addr) },
 	[IFLA_GENEVE_TTL]		= { .type = NLA_U8 },
 	[IFLA_GENEVE_TOS]		= { .type = NLA_U8 },
+	[IFLA_GENEVE_LABEL]		= { .type = NLA_U32 },
 	[IFLA_GENEVE_PORT]		= { .type = NLA_U16 },
 	[IFLA_GENEVE_COLLECT_METADATA]	= { .type = NLA_FLAG },
 	[IFLA_GENEVE_UDP_CSUM]		= { .type = NLA_U8 },
@@ -1295,8 +1303,8 @@ static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
 
 static int geneve_configure(struct net *net, struct net_device *dev,
 			    union geneve_addr *remote,
-			    __u32 vni, __u8 ttl, __u8 tos, __be16 dst_port,
-			    bool metadata, u32 flags)
+			    __u32 vni, __u8 ttl, __u8 tos, __be32 label,
+			    __be16 dst_port, bool metadata, u32 flags)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_dev *t, *geneve = netdev_priv(dev);
@@ -1306,7 +1314,7 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	if (!remote)
 		return -EINVAL;
 	if (metadata &&
-	    (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl))
+	    (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl || label))
 		return -EINVAL;
 
 	geneve->net = net;
@@ -1321,10 +1329,14 @@ static int geneve_configure(struct net *net, struct net_device *dev,
 	    (remote->sa.sa_family == AF_INET6 &&
 	     ipv6_addr_is_multicast(&remote->sin6.sin6_addr)))
 		return -EINVAL;
+	if (label && remote->sa.sa_family != AF_INET6)
+		return -EINVAL;
+
 	geneve->remote = *remote;
 
 	geneve->ttl = ttl;
 	geneve->tos = tos;
+	geneve->label = label;
 	geneve->dst_port = dst_port;
 	geneve->collect_md = metadata;
 	geneve->flags = flags;
@@ -1367,6 +1379,7 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 	__u8 ttl = 0, tos = 0;
 	bool metadata = false;
 	union geneve_addr remote = geneve_remote_unspec;
+	__be32 label = 0;
 	__u32 vni = 0;
 	u32 flags = 0;
 
@@ -1403,6 +1416,10 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 	if (data[IFLA_GENEVE_TOS])
 		tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
 
+	if (data[IFLA_GENEVE_LABEL])
+		label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
+			IPV6_FLOWLABEL_MASK;
+
 	if (data[IFLA_GENEVE_PORT])
 		dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]);
 
@@ -1421,8 +1438,8 @@ static int geneve_newlink(struct net *net, struct net_device *dev,
 	    nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]))
 		flags |= GENEVE_F_UDP_ZERO_CSUM6_RX;
 
-	return geneve_configure(net, dev, &remote, vni, ttl, tos, dst_port,
-				metadata, flags);
+	return geneve_configure(net, dev, &remote, vni, ttl, tos, label,
+				dst_port, metadata, flags);
 }
 
 static void geneve_dellink(struct net_device *dev, struct list_head *head)
@@ -1439,6 +1456,7 @@ static size_t geneve_get_size(const struct net_device *dev)
 		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
+		nla_total_size(sizeof(__be32)) +  /* IFLA_GENEVE_LABEL */
 		nla_total_size(sizeof(__be16)) +  /* IFLA_GENEVE_PORT */
 		nla_total_size(0) +	 /* IFLA_GENEVE_COLLECT_METADATA */
 		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */
@@ -1469,7 +1487,8 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	}
 
 	if (nla_put_u8(skb, IFLA_GENEVE_TTL, geneve->ttl) ||
-	    nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos))
+	    nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos) ||
+	    nla_put_be32(skb, IFLA_GENEVE_LABEL, geneve->label))
 		goto nla_put_failure;
 
 	if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port))
@@ -1521,7 +1540,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
 		return dev;
 
 	err = geneve_configure(net, dev, &geneve_remote_unspec,
-			       0, 0, 0, htons(dst_port), true,
+			       0, 0, 0, 0, htons(dst_port), true,
 			       GENEVE_F_UDP_ZERO_CSUM6_RX);
 	if (err)
 		goto err;
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 6bebc975031d..249eef9a21bd 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -479,6 +479,7 @@ enum {
 	IFLA_GENEVE_UDP_CSUM,
 	IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
 	IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
+	IFLA_GENEVE_LABEL,
 	__IFLA_GENEVE_MAX
 };
 #define IFLA_GENEVE_MAX	(__IFLA_GENEVE_MAX - 1)
-- 
cgit v1.2.3


From 4018ab1875e0d00b84ac61bc15427136ad55849e Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 9 Mar 2016 03:00:05 +0100
Subject: bpf: support flow label for bpf_skb_{set, get}_tunnel_key

This patch extends bpf_tunnel_key with a tunnel_label member, that maps
to ip_tunnel_key's label so underlying backends like vxlan and geneve
can propagate the label to udp_tunnel6_xmit_skb(), where it's being set
in the IPv6 header. It allows for having 20 more bits to encode/decode
flow related meta information programmatically. Tested with vxlan and
geneve.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h |  1 +
 net/core/filter.c        | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0e30b19012a5..924f537183fd 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,7 @@ struct bpf_tunnel_key {
 	};
 	__u8 tunnel_tos;
 	__u8 tunnel_ttl;
+	__u32 tunnel_label;
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/net/core/filter.c b/net/core/filter.c
index a66dc03c261f..6fc3893a6170 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1770,12 +1770,15 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		return -EPROTO;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
+		case offsetof(struct bpf_tunnel_key, tunnel_label):
+			goto set_compat;
 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
 			/* Fixup deprecated structure layouts here, so we have
 			 * a common path later on.
 			 */
 			if (ip_tunnel_info_af(info) != AF_INET)
 				return -EINVAL;
+set_compat:
 			to = (struct bpf_tunnel_key *)compat;
 			break;
 		default:
@@ -1787,11 +1790,13 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	to->tunnel_tos = info->key.tos;
 	to->tunnel_ttl = info->key.ttl;
 
-	if (flags & BPF_F_TUNINFO_IPV6)
+	if (flags & BPF_F_TUNINFO_IPV6) {
 		memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
 		       sizeof(to->remote_ipv6));
-	else
+		to->tunnel_label = be32_to_cpu(info->key.label);
+	} else {
 		to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+	}
 
 	if (unlikely(size != sizeof(struct bpf_tunnel_key)))
 		memcpy((void *)(long) r2, to, size);
@@ -1850,6 +1855,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		return -EINVAL;
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
+		case offsetof(struct bpf_tunnel_key, tunnel_label):
 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
 			/* Fixup deprecated structure layouts here, so we have
 			 * a common path later on.
@@ -1862,6 +1868,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 			return -EINVAL;
 		}
 	}
+	if (unlikely(!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label))
+		return -EINVAL;
 
 	skb_dst_drop(skb);
 	dst_hold((struct dst_entry *) md);
@@ -1882,6 +1890,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 		info->mode |= IP_TUNNEL_INFO_IPV6;
 		memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
 		       sizeof(from->remote_ipv6));
+		info->key.label = cpu_to_be32(from->tunnel_label) &
+				  IPV6_FLOWLABEL_MASK;
 	} else {
 		info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
 		if (flags & BPF_F_ZERO_CSUM_TX)
-- 
cgit v1.2.3


From 136ba622de49a6bf1f6e5eab3391ed5d5dbe30e3 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Thu, 10 Mar 2016 08:55:50 +0000
Subject: netconf: add macro to represent all attributes

This patch adds macro NETCONFA_ALL to represent all type of netconf
attributes for IPv4 and IPv6.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/netconf.h |  1 +
 net/ipv4/devinet.c           | 40 +++++++++++++++++++++++-----------------
 net/ipv6/addrconf.c          | 36 +++++++++++++++++++++---------------
 3 files changed, 45 insertions(+), 32 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netconf.h b/include/uapi/linux/netconf.h
index 23cbd34e4ac7..45dfad509c4d 100644
--- a/include/uapi/linux/netconf.h
+++ b/include/uapi/linux/netconf.h
@@ -19,6 +19,7 @@ enum {
 	__NETCONFA_MAX
 };
 #define NETCONFA_MAX	(__NETCONFA_MAX - 1)
+#define NETCONFA_ALL	-1
 
 #define NETCONFA_IFINDEX_ALL		-1
 #define NETCONFA_IFINDEX_DEFAULT	-2
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 8c3df2ccba45..65e76a48382c 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1753,17 +1753,20 @@ static int inet_netconf_msgsize_devconf(int type)
 {
 	int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
 		   + nla_total_size(4);	/* NETCONFA_IFINDEX */
+	bool all = false;
 
-	/* type -1 is used for ALL */
-	if (type == -1 || type == NETCONFA_FORWARDING)
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	if (all || type == NETCONFA_FORWARDING)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_RP_FILTER)
+	if (all || type == NETCONFA_RP_FILTER)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_MC_FORWARDING)
+	if (all || type == NETCONFA_MC_FORWARDING)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+	if (all || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
-	if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
 		size += nla_total_size(4);
 
 	return size;
@@ -1776,36 +1779,39 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 {
 	struct nlmsghdr  *nlh;
 	struct netconfmsg *ncm;
+	bool all = false;
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
 			flags);
 	if (!nlh)
 		return -EMSGSIZE;
 
+	if (type == NETCONFA_ALL)
+		all = true;
+
 	ncm = nlmsg_data(nlh);
 	ncm->ncm_family = AF_INET;
 
 	if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
 		goto nla_put_failure;
 
-	/* type -1 is used for ALL */
-	if ((type == -1 || type == NETCONFA_FORWARDING) &&
+	if ((all || type == NETCONFA_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_FORWARDING,
 			IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_RP_FILTER) &&
+	if ((all || type == NETCONFA_RP_FILTER) &&
 	    nla_put_s32(skb, NETCONFA_RP_FILTER,
 			IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+	if ((all || type == NETCONFA_MC_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
 			IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
 			IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
 		goto nla_put_failure;
-	if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
 	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 			IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
 		goto nla_put_failure;
@@ -1893,14 +1899,14 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
 	}
 
 	err = -ENOBUFS;
-	skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+	skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
 	err = inet_netconf_fill_devconf(skb, ifindex, devconf,
 					NETLINK_CB(in_skb).portid,
 					nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
-					-1);
+					NETCONFA_ALL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
 		WARN_ON(err == -EMSGSIZE);
@@ -1944,7 +1950,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
 						      cb->nlh->nlmsg_seq,
 						      RTM_NEWNETCONF,
 						      NLM_F_MULTI,
-						      -1) < 0) {
+						      NETCONFA_ALL) < 0) {
 				rcu_read_unlock();
 				goto done;
 			}
@@ -1960,7 +1966,7 @@ cont:
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWNETCONF, NLM_F_MULTI,
-					      -1) < 0)
+					      NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
@@ -1971,7 +1977,7 @@ cont:
 					      NETLINK_CB(cb->skb).portid,
 					      cb->nlh->nlmsg_seq,
 					      RTM_NEWNETCONF, NLM_F_MULTI,
-					      -1) < 0)
+					      NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8c0dab2de5c9..27aed1afcf81 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -473,18 +473,21 @@ static int inet6_netconf_msgsize_devconf(int type)
 {
 	int size =  NLMSG_ALIGN(sizeof(struct netconfmsg))
 		    + nla_total_size(4);	/* NETCONFA_IFINDEX */
+	bool all = false;
 
-	/* type -1 is used for ALL */
-	if (type == -1 || type == NETCONFA_FORWARDING)
+	if (type == NETCONFA_ALL)
+		all = true;
+
+	if (all || type == NETCONFA_FORWARDING)
 		size += nla_total_size(4);
 #ifdef CONFIG_IPV6_MROUTE
-	if (type == -1 || type == NETCONFA_MC_FORWARDING)
+	if (all || type == NETCONFA_MC_FORWARDING)
 		size += nla_total_size(4);
 #endif
-	if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+	if (all || type == NETCONFA_PROXY_NEIGH)
 		size += nla_total_size(4);
 
-	if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+	if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
 		size += nla_total_size(4);
 
 	return size;
@@ -497,33 +500,36 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
 {
 	struct nlmsghdr  *nlh;
 	struct netconfmsg *ncm;
+	bool all = false;
 
 	nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
 			flags);
 	if (!nlh)
 		return -EMSGSIZE;
 
+	if (type == NETCONFA_ALL)
+		all = true;
+
 	ncm = nlmsg_data(nlh);
 	ncm->ncm_family = AF_INET6;
 
 	if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
 		goto nla_put_failure;
 
-	/* type -1 is used for ALL */
-	if ((type == -1 || type == NETCONFA_FORWARDING) &&
+	if ((all || type == NETCONFA_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0)
 		goto nla_put_failure;
 #ifdef CONFIG_IPV6_MROUTE
-	if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+	if ((all || type == NETCONFA_MC_FORWARDING) &&
 	    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
 			devconf->mc_forwarding) < 0)
 		goto nla_put_failure;
 #endif
-	if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+	if ((all || type == NETCONFA_PROXY_NEIGH) &&
 	    nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
 		goto nla_put_failure;
 
-	if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+	if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
 	    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
 			devconf->ignore_routes_with_linkdown) < 0)
 		goto nla_put_failure;
@@ -609,14 +615,14 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
 	}
 
 	err = -ENOBUFS;
-	skb = nlmsg_new(inet6_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+	skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
 	if (!skb)
 		goto errout;
 
 	err = inet6_netconf_fill_devconf(skb, ifindex, devconf,
 					 NETLINK_CB(in_skb).portid,
 					 nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
-					 -1);
+					 NETCONFA_ALL);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
 		WARN_ON(err == -EMSGSIZE);
@@ -660,7 +666,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb,
 						       cb->nlh->nlmsg_seq,
 						       RTM_NEWNETCONF,
 						       NLM_F_MULTI,
-						       -1) < 0) {
+						       NETCONFA_ALL) < 0) {
 				rcu_read_unlock();
 				goto done;
 			}
@@ -676,7 +682,7 @@ cont:
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq,
 					       RTM_NEWNETCONF, NLM_F_MULTI,
-					       -1) < 0)
+					       NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
@@ -687,7 +693,7 @@ cont:
 					       NETLINK_CB(cb->skb).portid,
 					       cb->nlh->nlmsg_seq,
 					       RTM_NEWNETCONF, NLM_F_MULTI,
-					       -1) < 0)
+					       NETCONFA_ALL) < 0)
 			goto done;
 		else
 			h++;
-- 
cgit v1.2.3


From dece8d2b78d19df7fe5e4e965f1f0d1a3e188d1b Mon Sep 17 00:00:00 2001
From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri, 11 Mar 2016 18:07:31 +0100
Subject: uapi: add MACsec bits

Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/Kbuild      |   1 +
 include/uapi/linux/if_ether.h  |   1 +
 include/uapi/linux/if_link.h   |  29 ++++++++
 include/uapi/linux/if_macsec.h | 161 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 192 insertions(+)
 create mode 100644 include/uapi/linux/if_macsec.h

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index ebd10e624598..e25ebcfbcb48 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -173,6 +173,7 @@ header-y += if_hippi.h
 header-y += if_infiniband.h
 header-y += if_link.h
 header-y += if_ltalk.h
+header-y += if_macsec.h
 header-y += if_packet.h
 header-y += if_phonet.h
 header-y += if_plip.h
diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index ea9221b0331a..4a93051c578c 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -83,6 +83,7 @@
 #define ETH_P_8021AD	0x88A8          /* 802.1ad Service VLAN		*/
 #define ETH_P_802_EX1	0x88B5		/* 802.1 Local Experimental 1.  */
 #define ETH_P_TIPC	0x88CA		/* TIPC 			*/
+#define ETH_P_MACSEC	0x88E5		/* 802.1ae MACsec */
 #define ETH_P_8021AH	0x88E7          /* 802.1ah Backbone Service Tag */
 #define ETH_P_MVRP	0x88F5          /* 802.1Q MVRP                  */
 #define ETH_P_1588	0x88F7		/* IEEE 1588 Timesync */
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 249eef9a21bd..8e3f88fa5b59 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -413,6 +413,35 @@ enum {
 
 #define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1)
 
+/* MACSEC section */
+enum {
+	IFLA_MACSEC_UNSPEC,
+	IFLA_MACSEC_SCI,
+	IFLA_MACSEC_PORT,
+	IFLA_MACSEC_ICV_LEN,
+	IFLA_MACSEC_CIPHER_SUITE,
+	IFLA_MACSEC_WINDOW,
+	IFLA_MACSEC_ENCODING_SA,
+	IFLA_MACSEC_ENCRYPT,
+	IFLA_MACSEC_PROTECT,
+	IFLA_MACSEC_INC_SCI,
+	IFLA_MACSEC_ES,
+	IFLA_MACSEC_SCB,
+	IFLA_MACSEC_REPLAY_PROTECT,
+	IFLA_MACSEC_VALIDATION,
+	__IFLA_MACSEC_MAX,
+};
+
+#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1)
+
+enum macsec_validation_type {
+	MACSEC_VALIDATE_DISABLED = 0,
+	MACSEC_VALIDATE_CHECK = 1,
+	MACSEC_VALIDATE_STRICT = 2,
+	__MACSEC_VALIDATE_END,
+	MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1,
+};
+
 /* IPVLAN section */
 enum {
 	IFLA_IPVLAN_UNSPEC,
diff --git a/include/uapi/linux/if_macsec.h b/include/uapi/linux/if_macsec.h
new file mode 100644
index 000000000000..26b0d1e3e3e7
--- /dev/null
+++ b/include/uapi/linux/if_macsec.h
@@ -0,0 +1,161 @@
+/*
+ * include/uapi/linux/if_macsec.h - MACsec device
+ *
+ * Copyright (c) 2015 Sabrina Dubroca <sd@queasysnail.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _UAPI_MACSEC_H
+#define _UAPI_MACSEC_H
+
+#include <linux/types.h>
+
+#define MACSEC_GENL_NAME "macsec"
+#define MACSEC_GENL_VERSION 1
+
+#define MACSEC_MAX_KEY_LEN 128
+
+#define DEFAULT_CIPHER_ID   0x0080020001000001ULL
+#define DEFAULT_CIPHER_ALT  0x0080C20001000001ULL
+
+#define MACSEC_MIN_ICV_LEN 8
+#define MACSEC_MAX_ICV_LEN 32
+
+enum macsec_attrs {
+	MACSEC_ATTR_UNSPEC,
+	MACSEC_ATTR_IFINDEX,     /* u32, ifindex of the MACsec netdevice */
+	MACSEC_ATTR_RXSC_CONFIG, /* config, nested macsec_rxsc_attrs */
+	MACSEC_ATTR_SA_CONFIG,   /* config, nested macsec_sa_attrs */
+	MACSEC_ATTR_SECY,        /* dump, nested macsec_secy_attrs */
+	MACSEC_ATTR_TXSA_LIST,   /* dump, nested, macsec_sa_attrs for each TXSA */
+	MACSEC_ATTR_RXSC_LIST,   /* dump, nested, macsec_rxsc_attrs for each RXSC */
+	MACSEC_ATTR_TXSC_STATS,  /* dump, nested, macsec_txsc_stats_attr */
+	MACSEC_ATTR_SECY_STATS,  /* dump, nested, macsec_secy_stats_attr */
+	__MACSEC_ATTR_END,
+	NUM_MACSEC_ATTR = __MACSEC_ATTR_END,
+	MACSEC_ATTR_MAX = __MACSEC_ATTR_END - 1,
+};
+
+enum macsec_secy_attrs {
+	MACSEC_SECY_ATTR_UNSPEC,
+	MACSEC_SECY_ATTR_SCI,
+	MACSEC_SECY_ATTR_ENCODING_SA,
+	MACSEC_SECY_ATTR_WINDOW,
+	MACSEC_SECY_ATTR_CIPHER_SUITE,
+	MACSEC_SECY_ATTR_ICV_LEN,
+	MACSEC_SECY_ATTR_PROTECT,
+	MACSEC_SECY_ATTR_REPLAY,
+	MACSEC_SECY_ATTR_OPER,
+	MACSEC_SECY_ATTR_VALIDATE,
+	MACSEC_SECY_ATTR_ENCRYPT,
+	MACSEC_SECY_ATTR_INC_SCI,
+	MACSEC_SECY_ATTR_ES,
+	MACSEC_SECY_ATTR_SCB,
+	__MACSEC_SECY_ATTR_END,
+	NUM_MACSEC_SECY_ATTR = __MACSEC_SECY_ATTR_END,
+	MACSEC_SECY_ATTR_MAX = __MACSEC_SECY_ATTR_END - 1,
+};
+
+enum macsec_rxsc_attrs {
+	MACSEC_RXSC_ATTR_UNSPEC,
+	MACSEC_RXSC_ATTR_SCI,     /* config/dump, u64 */
+	MACSEC_RXSC_ATTR_ACTIVE,  /* config/dump, u8 0..1 */
+	MACSEC_RXSC_ATTR_SA_LIST, /* dump, nested */
+	MACSEC_RXSC_ATTR_STATS,   /* dump, nested, macsec_rxsc_stats_attr */
+	__MACSEC_RXSC_ATTR_END,
+	NUM_MACSEC_RXSC_ATTR = __MACSEC_RXSC_ATTR_END,
+	MACSEC_RXSC_ATTR_MAX = __MACSEC_RXSC_ATTR_END - 1,
+};
+
+enum macsec_sa_attrs {
+	MACSEC_SA_ATTR_UNSPEC,
+	MACSEC_SA_ATTR_AN,     /* config/dump, u8 0..3 */
+	MACSEC_SA_ATTR_ACTIVE, /* config/dump, u8 0..1 */
+	MACSEC_SA_ATTR_PN,     /* config/dump, u32 */
+	MACSEC_SA_ATTR_KEY,    /* config, data */
+	MACSEC_SA_ATTR_KEYID,  /* config/dump, u64 */
+	MACSEC_SA_ATTR_STATS,  /* dump, nested, macsec_sa_stats_attr */
+	__MACSEC_SA_ATTR_END,
+	NUM_MACSEC_SA_ATTR = __MACSEC_SA_ATTR_END,
+	MACSEC_SA_ATTR_MAX = __MACSEC_SA_ATTR_END - 1,
+};
+
+enum macsec_nl_commands {
+	MACSEC_CMD_GET_TXSC,
+	MACSEC_CMD_ADD_RXSC,
+	MACSEC_CMD_DEL_RXSC,
+	MACSEC_CMD_UPD_RXSC,
+	MACSEC_CMD_ADD_TXSA,
+	MACSEC_CMD_DEL_TXSA,
+	MACSEC_CMD_UPD_TXSA,
+	MACSEC_CMD_ADD_RXSA,
+	MACSEC_CMD_DEL_RXSA,
+	MACSEC_CMD_UPD_RXSA,
+};
+
+/* u64 per-RXSC stats */
+enum macsec_rxsc_stats_attr {
+	MACSEC_RXSC_STATS_ATTR_UNSPEC,
+	MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED,
+	MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA,
+	MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA,
+	__MACSEC_RXSC_STATS_ATTR_END,
+	NUM_MACSEC_RXSC_STATS_ATTR = __MACSEC_RXSC_STATS_ATTR_END,
+	MACSEC_RXSC_STATS_ATTR_MAX = __MACSEC_RXSC_STATS_ATTR_END - 1,
+};
+
+/* u32 per-{RX,TX}SA stats */
+enum macsec_sa_stats_attr {
+	MACSEC_SA_STATS_ATTR_UNSPEC,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_OK,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA,
+	MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA,
+	MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED,
+	MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED,
+	__MACSEC_SA_STATS_ATTR_END,
+	NUM_MACSEC_SA_STATS_ATTR = __MACSEC_SA_STATS_ATTR_END,
+	MACSEC_SA_STATS_ATTR_MAX = __MACSEC_SA_STATS_ATTR_END - 1,
+};
+
+/* u64 per-TXSC stats */
+enum macsec_txsc_stats_attr {
+	MACSEC_TXSC_STATS_ATTR_UNSPEC,
+	MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED,
+	MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED,
+	MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED,
+	MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED,
+	__MACSEC_TXSC_STATS_ATTR_END,
+	NUM_MACSEC_TXSC_STATS_ATTR = __MACSEC_TXSC_STATS_ATTR_END,
+	MACSEC_TXSC_STATS_ATTR_MAX = __MACSEC_TXSC_STATS_ATTR_END - 1,
+};
+
+/* u64 per-SecY stats */
+enum macsec_secy_stats_attr {
+	MACSEC_SECY_STATS_ATTR_UNSPEC,
+	MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED,
+	MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI,
+	MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN,
+	__MACSEC_SECY_STATS_ATTR_END,
+	NUM_MACSEC_SECY_STATS_ATTR = __MACSEC_SECY_STATS_ATTR_END,
+	MACSEC_SECY_STATS_ATTR_MAX = __MACSEC_SECY_STATS_ATTR_END - 1,
+};
+
+#endif /* _UAPI_MACSEC_H */
-- 
cgit v1.2.3


From a44d6eacdaf56f74fad699af7f4925a5f5ac0e7f Mon Sep 17 00:00:00 2001
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 14 Mar 2016 10:52:15 -0700
Subject: tcp: Add RFC4898 tcpEStatsPerfDataSegsOut/In

Per RFC4898, they count segments sent/received
containing a positive length data segment (that includes
retransmission segments carrying data).  Unlike
tcpi_segs_out/in, tcpi_data_segs_out/in excludes segments
carrying no data (e.g. pure ack).

The patch also updates the segs_in in tcp_fastopen_add_skb()
so that segs_in >= data_segs_in property is kept.

Together with retransmission data, tcpi_data_segs_out
gives a better signal on the rxmit rate.

v6: Rebase on the latest net-next

v5: Eric pointed out that checking skb->len is still needed in
tcp_fastopen_add_skb() because skb can carry a FIN without data.
Hence, instead of open coding segs_in and data_segs_in, tcp_segs_in()
helper is used.  Comment is added to the fastopen case to explain why
segs_in has to be reset and tcp_segs_in() has to be called before
__skb_pull().

v4: Add comment to the changes in tcp_fastopen_add_skb()
and also add remark on this case in the commit message.

v3: Add const modifier to the skb parameter in tcp_segs_in()

v2: Rework based on recent fix by Eric:
commit a9d99ce28ed3 ("tcp: fix tcpi_segs_in after connection establishment")

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Cc: Chris Rapier <rapier@psc.edu>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Marcelo Ricardo Leitner <mleitner@redhat.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h      |  6 ++++++
 include/net/tcp.h        | 10 ++++++++++
 include/uapi/linux/tcp.h |  2 ++
 net/ipv4/tcp.c           |  2 ++
 net/ipv4/tcp_fastopen.c  |  8 ++++++++
 net/ipv4/tcp_ipv4.c      |  2 +-
 net/ipv4/tcp_minisocks.c |  2 +-
 net/ipv4/tcp_output.c    |  4 +++-
 net/ipv6/tcp_ipv6.c      |  2 +-
 9 files changed, 34 insertions(+), 4 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index bcbf51da4e1e..7be9b1242354 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -158,6 +158,9 @@ struct tcp_sock {
 	u32	segs_in;	/* RFC4898 tcpEStatsPerfSegsIn
 				 * total number of segments in.
 				 */
+	u32	data_segs_in;	/* RFC4898 tcpEStatsPerfDataSegsIn
+				 * total number of data segments in.
+				 */
  	u32	rcv_nxt;	/* What we want to receive next 	*/
 	u32	copied_seq;	/* Head of yet unread data		*/
 	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
@@ -165,6 +168,9 @@ struct tcp_sock {
 	u32	segs_out;	/* RFC4898 tcpEStatsPerfSegsOut
 				 * The total number of segments sent.
 				 */
+	u32	data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
+				 * total number of data segments sent.
+				 */
 	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0302636af98c..c8dbd293daae 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1840,4 +1840,14 @@ static inline int tcp_inq(struct sock *sk)
 	return answ;
 }
 
+static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb)
+{
+	u16 segs_in;
+
+	segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tp->segs_in += segs_in;
+	if (skb->len > tcp_hdrlen(skb))
+		tp->data_segs_in += segs_in;
+}
+
 #endif	/* _TCP_H */
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index fe95446e9abf..53e8e3fe6b1b 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -199,6 +199,8 @@ struct tcp_info {
 
 	__u32	tcpi_notsent_bytes;
 	__u32	tcpi_min_rtt;
+	__u32	tcpi_data_segs_in;	/* RFC4898 tcpEStatsDataSegsIn */
+	__u32	tcpi_data_segs_out;	/* RFC4898 tcpEStatsDataSegsOut */
 };
 
 /* for TCP_MD5SIG socket option */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a265f00b9df9..992b3103ec3e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2715,6 +2715,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	info->tcpi_notsent_bytes = max(0, notsent_bytes);
 
 	info->tcpi_min_rtt = tcp_min_rtt(tp);
+	info->tcpi_data_segs_in = tp->data_segs_in;
+	info->tcpi_data_segs_out = tp->data_segs_out;
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index fdb286ddba04..4fc0061bebf4 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -140,6 +140,14 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
 		return;
 
 	skb_dst_drop(skb);
+	/* segs_in has been initialized to 1 in tcp_create_openreq_child().
+	 * Hence, reset segs_in to 0 before calling tcp_segs_in()
+	 * to avoid double counting.  Also, tcp_segs_in() expects
+	 * skb->len to include the tcp_hdrlen.  Hence, it should
+	 * be called before __skb_pull().
+	 */
+	tp->segs_in = 0;
+	tcp_segs_in(tp, skb);
 	__skb_pull(skb, tcp_hdrlen(skb));
 	skb_set_owner_r(skb, sk);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4c8d58dfac9b..0b02ef773705 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1650,7 +1650,7 @@ process:
 	sk_incoming_cpu_update(sk);
 
 	bh_lock_sock_nested(sk);
-	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
 		if (!tcp_prequeue(sk, skb))
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index ae90e4b34bd3..acb366dd61e6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -812,7 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
 	int ret = 0;
 	int state = child->sk_state;
 
-	tcp_sk(child)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(child), skb);
 	if (!sock_owned_by_user(child)) {
 		ret = tcp_rcv_state_process(child, skb);
 		/* Wakeup parent, send SIGIO */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7d2c7a400456..7d2dc015cd19 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1003,8 +1003,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (likely(tcb->tcp_flags & TCPHDR_ACK))
 		tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
 
-	if (skb->len != tcp_header_size)
+	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
+		tp->data_segs_out += tcp_skb_pcount(skb);
+	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
 		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 33f2820181f9..9c16565b70cc 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1443,7 +1443,7 @@ process:
 	sk_incoming_cpu_update(sk);
 
 	bh_lock_sock_nested(sk);
-	tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+	tcp_segs_in(tcp_sk(sk), skb);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
 		if (!tcp_prequeue(sk, skb))
-- 
cgit v1.2.3


From bfa3f9d7f3b349acea8982d2248e33a0ed84c687 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:16 -0800
Subject: netfilter: Remove IP_CT_NEW_REPLY definition.

Remove the definition of IP_CT_NEW_REPLY from the kernel as it does
not make sense.  This allows the definition of IP_CT_NUMBER to be
simplified as well.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_conntrack_common.h | 12 +++++++++---
 net/openvswitch/conntrack.c                        |  2 --
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h
index 319f47128db8..6d074d14ee27 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_common.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_common.h
@@ -20,9 +20,15 @@ enum ip_conntrack_info {
 
 	IP_CT_ESTABLISHED_REPLY = IP_CT_ESTABLISHED + IP_CT_IS_REPLY,
 	IP_CT_RELATED_REPLY = IP_CT_RELATED + IP_CT_IS_REPLY,
-	IP_CT_NEW_REPLY = IP_CT_NEW + IP_CT_IS_REPLY,	
-	/* Number of distinct IP_CT types (no NEW in reply dirn). */
-	IP_CT_NUMBER = IP_CT_IS_REPLY * 2 - 1
+	/* No NEW in reply direction. */
+
+	/* Number of distinct IP_CT types. */
+	IP_CT_NUMBER,
+
+	/* only for userspace compatibility */
+#ifndef __KERNEL__
+	IP_CT_NEW_REPLY = IP_CT_NUMBER,
+#endif
 };
 
 #define NF_CT_STATE_INVALID_BIT			(1 << 0)
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index ee6ff8ffc12d..304529015744 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -75,7 +75,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
 	switch (ctinfo) {
 	case IP_CT_ESTABLISHED_REPLY:
 	case IP_CT_RELATED_REPLY:
-	case IP_CT_NEW_REPLY:
 		ct_state |= OVS_CS_F_REPLY_DIR;
 		break;
 	default:
@@ -92,7 +91,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
 		ct_state |= OVS_CS_F_RELATED;
 		break;
 	case IP_CT_NEW:
-	case IP_CT_NEW_REPLY:
 		ct_state |= OVS_CS_F_NEW;
 		break;
 	default:
-- 
cgit v1.2.3


From 05752523e56502cd9975aec0a2ded465d51a71f3 Mon Sep 17 00:00:00 2001
From: Jarno Rajahalme <jarno@ovn.org>
Date: Thu, 10 Mar 2016 10:54:23 -0800
Subject: openvswitch: Interface with NAT.

Extend OVS conntrack interface to cover NAT.  New nested
OVS_CT_ATTR_NAT attribute may be used to include NAT with a CT action.
A bare OVS_CT_ATTR_NAT only mangles existing and expected connections.
If OVS_NAT_ATTR_SRC or OVS_NAT_ATTR_DST is included within the nested
attributes, new (non-committed/non-confirmed) connections are mangled
according to the rest of the nested attributes.

The corresponding OVS userspace patch series includes test cases (in
tests/system-traffic.at) that also serve as example uses.

This work extends on a branch by Thomas Graf at
https://github.com/tgraf/ovs/tree/nat.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
Acked-by: Thomas Graf <tgraf@suug.ch>
Acked-by: Joe Stringer <joe@ovn.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/openvswitch.h |  49 ++++
 net/openvswitch/Kconfig          |   3 +-
 net/openvswitch/conntrack.c      | 524 +++++++++++++++++++++++++++++++++++++--
 net/openvswitch/conntrack.h      |   3 +-
 4 files changed, 551 insertions(+), 28 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h
index a27222d5b413..616d04761730 100644
--- a/include/uapi/linux/openvswitch.h
+++ b/include/uapi/linux/openvswitch.h
@@ -454,6 +454,14 @@ struct ovs_key_ct_labels {
 #define OVS_CS_F_REPLY_DIR         0x08 /* Flow is in the reply direction. */
 #define OVS_CS_F_INVALID           0x10 /* Could not track connection. */
 #define OVS_CS_F_TRACKED           0x20 /* Conntrack has occurred. */
+#define OVS_CS_F_SRC_NAT           0x40 /* Packet's source address/port was
+					 * mangled by NAT.
+					 */
+#define OVS_CS_F_DST_NAT           0x80 /* Packet's destination address/port
+					 * was mangled by NAT.
+					 */
+
+#define OVS_CS_F_NAT_MASK (OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 
 /**
  * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands.
@@ -632,6 +640,8 @@ struct ovs_action_hash {
  * mask. For each bit set in the mask, the corresponding bit in the value is
  * copied to the connection tracking label field in the connection.
  * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG.
+ * @OVS_CT_ATTR_NAT: Nested OVS_NAT_ATTR_* for performing L3 network address
+ * translation (NAT) on the packet.
  */
 enum ovs_ct_attr {
 	OVS_CT_ATTR_UNSPEC,
@@ -641,11 +651,50 @@ enum ovs_ct_attr {
 	OVS_CT_ATTR_LABELS,     /* labels to associate with this connection. */
 	OVS_CT_ATTR_HELPER,     /* netlink helper to assist detection of
 				   related connections. */
+	OVS_CT_ATTR_NAT,        /* Nested OVS_NAT_ATTR_* */
 	__OVS_CT_ATTR_MAX
 };
 
 #define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1)
 
+/**
+ * enum ovs_nat_attr - Attributes for %OVS_CT_ATTR_NAT.
+ *
+ * @OVS_NAT_ATTR_SRC: Flag for Source NAT (mangle source address/port).
+ * @OVS_NAT_ATTR_DST: Flag for Destination NAT (mangle destination
+ * address/port).  Only one of (@OVS_NAT_ATTR_SRC, @OVS_NAT_ATTR_DST) may be
+ * specified.  Effective only for packets for ct_state NEW connections.
+ * Packets of committed connections are mangled by the NAT action according to
+ * the committed NAT type regardless of the flags specified.  As a corollary, a
+ * NAT action without a NAT type flag will only mangle packets of committed
+ * connections.  The following NAT attributes only apply for NEW
+ * (non-committed) connections, and they may be included only when the CT
+ * action has the @OVS_CT_ATTR_COMMIT flag and either @OVS_NAT_ATTR_SRC or
+ * @OVS_NAT_ATTR_DST is also included.
+ * @OVS_NAT_ATTR_IP_MIN: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_IP_MAX: struct in_addr or struct in6_addr
+ * @OVS_NAT_ATTR_PROTO_MIN: u16 L4 protocol specific lower boundary (port)
+ * @OVS_NAT_ATTR_PROTO_MAX: u16 L4 protocol specific upper boundary (port)
+ * @OVS_NAT_ATTR_PERSISTENT: Flag for persistent IP mapping across reboots
+ * @OVS_NAT_ATTR_PROTO_HASH: Flag for pseudo random L4 port mapping (MD5)
+ * @OVS_NAT_ATTR_PROTO_RANDOM: Flag for fully randomized L4 port mapping
+ */
+enum ovs_nat_attr {
+	OVS_NAT_ATTR_UNSPEC,
+	OVS_NAT_ATTR_SRC,
+	OVS_NAT_ATTR_DST,
+	OVS_NAT_ATTR_IP_MIN,
+	OVS_NAT_ATTR_IP_MAX,
+	OVS_NAT_ATTR_PROTO_MIN,
+	OVS_NAT_ATTR_PROTO_MAX,
+	OVS_NAT_ATTR_PERSISTENT,
+	OVS_NAT_ATTR_PROTO_HASH,
+	OVS_NAT_ATTR_PROTO_RANDOM,
+	__OVS_NAT_ATTR_MAX,
+};
+
+#define OVS_NAT_ATTR_MAX (__OVS_NAT_ATTR_MAX - 1)
+
 /**
  * enum ovs_action_attr - Action types.
  *
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index cd5fd9d728a7..234a73344c6e 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -6,7 +6,8 @@ config OPENVSWITCH
 	tristate "Open vSwitch"
 	depends on INET
 	depends on !NF_CONNTRACK || \
-		   (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6))
+		   (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
+				     (!NF_NAT || NF_NAT)))
 	select LIBCRC32C
 	select MPLS
 	select NET_MPLS_GSO
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index f718b724e650..dc5eb29fe7d6 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -13,21 +13,31 @@
 
 #include <linux/module.h>
 #include <linux/openvswitch.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
 #include <net/ip.h>
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
+#ifdef CONFIG_NF_NAT_NEEDED
+#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#endif
+
 #include "datapath.h"
 #include "conntrack.h"
 #include "flow.h"
 #include "flow_netlink.h"
 
 struct ovs_ct_len_tbl {
-	size_t maxlen;
-	size_t minlen;
+	int maxlen;
+	int minlen;
 };
 
 /* Metadata mark for masked write to conntrack mark */
@@ -42,15 +52,25 @@ struct md_labels {
 	struct ovs_key_ct_labels mask;
 };
 
+enum ovs_ct_nat {
+	OVS_CT_NAT = 1 << 0,     /* NAT for committed connections only. */
+	OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
+	OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
+};
+
 /* Conntrack action context for execution. */
 struct ovs_conntrack_info {
 	struct nf_conntrack_helper *helper;
 	struct nf_conntrack_zone zone;
 	struct nf_conn *ct;
 	u8 commit : 1;
+	u8 nat : 3;                 /* enum ovs_ct_nat */
 	u16 family;
 	struct md_mark mark;
 	struct md_labels labels;
+#ifdef CONFIG_NF_NAT_NEEDED
+	struct nf_nat_range range;  /* Only present for SRC NAT and DST NAT. */
+#endif
 };
 
 static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
@@ -137,12 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
 	ovs_ct_get_labels(ct, &key->ct.labels);
 }
 
-/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
- * previously sent the packet to conntrack via the ct action.
+/* Update 'key' based on skb->nfct.  If 'post_ct' is true, then OVS has
+ * previously sent the packet to conntrack via the ct action.  If
+ * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
+ * initialized from the connection status.
  */
 static void ovs_ct_update_key(const struct sk_buff *skb,
 			      const struct ovs_conntrack_info *info,
-			      struct sw_flow_key *key, bool post_ct)
+			      struct sw_flow_key *key, bool post_ct,
+			      bool keep_nat_flags)
 {
 	const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
 	enum ip_conntrack_info ctinfo;
@@ -160,6 +183,14 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
 		 */
 		if (ct->master)
 			state |= OVS_CS_F_RELATED;
+		if (keep_nat_flags) {
+			state |= key->ct.state & OVS_CS_F_NAT_MASK;
+		} else {
+			if (ct->status & IPS_SRC_NAT)
+				state |= OVS_CS_F_SRC_NAT;
+			if (ct->status & IPS_DST_NAT)
+				state |= OVS_CS_F_DST_NAT;
+		}
 		zone = nf_ct_zone(ct);
 	} else if (post_ct) {
 		state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
@@ -174,7 +205,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
  */
 void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
 {
-	ovs_ct_update_key(skb, NULL, key, false);
+	ovs_ct_update_key(skb, NULL, key, false, false);
 }
 
 int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
@@ -263,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 	enum ip_conntrack_info ctinfo;
 	unsigned int protoff;
 	struct nf_conn *ct;
+	int err;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	if (!ct || ctinfo == IP_CT_RELATED_REPLY)
@@ -299,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 		return NF_DROP;
 	}
 
-	return helper->help(skb, protoff, ct, ctinfo);
+	err = helper->help(skb, protoff, ct, ctinfo);
+	if (err != NF_ACCEPT)
+		return err;
+
+	/* Adjust seqs after helper.  This is needed due to some helpers (e.g.,
+	 * FTP with NAT) adusting the TCP payload size when mangling IP
+	 * addresses and/or port numbers in the text-based control connection.
+	 */
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+	    !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+		return NF_DROP;
+	return NF_ACCEPT;
 }
 
 /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
@@ -468,6 +511,200 @@ static bool skb_nfct_cached(struct net *net,
 	return true;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+			      enum ip_conntrack_info ctinfo,
+			      const struct nf_nat_range *range,
+			      enum nf_nat_manip_type maniptype)
+{
+	int hooknum, nh_off, err = NF_ACCEPT;
+
+	nh_off = skb_network_offset(skb);
+	skb_pull(skb, nh_off);
+
+	/* See HOOK2MANIP(). */
+	if (maniptype == NF_NAT_MANIP_SRC)
+		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+	else
+		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED_REPLY:
+		if (skb->protocol == htons(ETH_P_IP) &&
+		    ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+							   hooknum))
+				err = NF_DROP;
+			goto push;
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+		} else if (skb->protocol == htons(ETH_P_IPV6)) {
+			__be16 frag_off;
+			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+			int hdrlen = ipv6_skip_exthdr(skb,
+						      sizeof(struct ipv6hdr),
+						      &nexthdr, &frag_off);
+
+			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+				if (!nf_nat_icmpv6_reply_translation(skb, ct,
+								     ctinfo,
+								     hooknum,
+								     hdrlen))
+					err = NF_DROP;
+				goto push;
+			}
+#endif
+		}
+		/* Non-ICMP, fall thru to initialize if needed. */
+	case IP_CT_NEW:
+		/* Seen it before?  This can happen for loopback, retrans,
+		 * or local packets.
+		 */
+		if (!nf_nat_initialized(ct, maniptype)) {
+			/* Initialize according to the NAT action. */
+			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+				/* Action is set up to establish a new
+				 * mapping.
+				 */
+				? nf_nat_setup_info(ct, range, maniptype)
+				: nf_nat_alloc_null_binding(ct, hooknum);
+			if (err != NF_ACCEPT)
+				goto push;
+		}
+		break;
+
+	case IP_CT_ESTABLISHED:
+	case IP_CT_ESTABLISHED_REPLY:
+		break;
+
+	default:
+		err = NF_DROP;
+		goto push;
+	}
+
+	err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+push:
+	skb_push(skb, nh_off);
+
+	return err;
+}
+
+static void ovs_nat_update_key(struct sw_flow_key *key,
+			       const struct sk_buff *skb,
+			       enum nf_nat_manip_type maniptype)
+{
+	if (maniptype == NF_NAT_MANIP_SRC) {
+		__be16 src;
+
+		key->ct.state |= OVS_CS_F_SRC_NAT;
+		if (key->eth.type == htons(ETH_P_IP))
+			key->ipv4.addr.src = ip_hdr(skb)->saddr;
+		else if (key->eth.type == htons(ETH_P_IPV6))
+			memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
+			       sizeof(key->ipv6.addr.src));
+		else
+			return;
+
+		if (key->ip.proto == IPPROTO_UDP)
+			src = udp_hdr(skb)->source;
+		else if (key->ip.proto == IPPROTO_TCP)
+			src = tcp_hdr(skb)->source;
+		else if (key->ip.proto == IPPROTO_SCTP)
+			src = sctp_hdr(skb)->source;
+		else
+			return;
+
+		key->tp.src = src;
+	} else {
+		__be16 dst;
+
+		key->ct.state |= OVS_CS_F_DST_NAT;
+		if (key->eth.type == htons(ETH_P_IP))
+			key->ipv4.addr.dst = ip_hdr(skb)->daddr;
+		else if (key->eth.type == htons(ETH_P_IPV6))
+			memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
+			       sizeof(key->ipv6.addr.dst));
+		else
+			return;
+
+		if (key->ip.proto == IPPROTO_UDP)
+			dst = udp_hdr(skb)->dest;
+		else if (key->ip.proto == IPPROTO_TCP)
+			dst = tcp_hdr(skb)->dest;
+		else if (key->ip.proto == IPPROTO_SCTP)
+			dst = sctp_hdr(skb)->dest;
+		else
+			return;
+
+		key->tp.dst = dst;
+	}
+}
+
+/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+		      const struct ovs_conntrack_info *info,
+		      struct sk_buff *skb, struct nf_conn *ct,
+		      enum ip_conntrack_info ctinfo)
+{
+	enum nf_nat_manip_type maniptype;
+	int err;
+
+	if (nf_ct_is_untracked(ct)) {
+		/* A NAT action may only be performed on tracked packets. */
+		return NF_ACCEPT;
+	}
+
+	/* Add NAT extension if not confirmed yet. */
+	if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+		return NF_ACCEPT;   /* Can't NAT. */
+
+	/* Determine NAT type.
+	 * Check if the NAT type can be deduced from the tracked connection.
+	 * Make sure expected traffic is NATted only when committing.
+	 */
+	if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
+	    ct->status & IPS_NAT_MASK &&
+	    (!(ct->status & IPS_EXPECTED_BIT) || info->commit)) {
+		/* NAT an established or related connection like before. */
+		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+			/* This is the REPLY direction for a connection
+			 * for which NAT was applied in the forward
+			 * direction.  Do the reverse NAT.
+			 */
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+		else
+			maniptype = ct->status & IPS_SRC_NAT
+				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+	} else if (info->nat & OVS_CT_SRC_NAT) {
+		maniptype = NF_NAT_MANIP_SRC;
+	} else if (info->nat & OVS_CT_DST_NAT) {
+		maniptype = NF_NAT_MANIP_DST;
+	} else {
+		return NF_ACCEPT; /* Connection is not NATed. */
+	}
+	err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
+
+	/* Mark NAT done if successful and update the flow key. */
+	if (err == NF_ACCEPT)
+		ovs_nat_update_key(key, skb, maniptype);
+
+	return err;
+}
+#else /* !CONFIG_NF_NAT_NEEDED */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+		      const struct ovs_conntrack_info *info,
+		      struct sk_buff *skb, struct nf_conn *ct,
+		      enum ip_conntrack_info ctinfo)
+{
+	return NF_ACCEPT;
+}
+#endif
+
 /* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
  * not done already.  Update key with new CT state after passing the packet
  * through conntrack.
@@ -509,19 +746,43 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 		if (err != NF_ACCEPT)
 			return -ENOENT;
 
-		ovs_ct_update_key(skb, info, key, true);
+		/* Clear CT state NAT flags to mark that we have not yet done
+		 * NAT after the nf_conntrack_in() call.  We can actually clear
+		 * the whole state, as it will be re-initialized below.
+		 */
+		key->ct.state = 0;
+
+		/* Update the key, but keep the NAT flags. */
+		ovs_ct_update_key(skb, info, key, true, true);
 	}
 
-	/* Call the helper only if:
-	 * - nf_conntrack_in() was executed above ("!cached") for a confirmed
-	 *   connection, or
-	 * - When committing an unconfirmed connection.
-	 */
 	ct = nf_ct_get(skb, &ctinfo);
-	if (ct && (nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
-	    ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
-		WARN_ONCE(1, "helper rejected packet");
-		return -EINVAL;
+	if (ct) {
+		/* Packets starting a new connection must be NATted before the
+		 * helper, so that the helper knows about the NAT.  We enforce
+		 * this by delaying both NAT and helper calls for unconfirmed
+		 * connections until the committing CT action.  For later
+		 * packets NAT and Helper may be called in either order.
+		 *
+		 * NAT will be done only if the CT action has NAT, and only
+		 * once per packet (per zone), as guarded by the NAT bits in
+		 * the key->ct.state.
+		 */
+		if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+		    (nf_ct_is_confirmed(ct) || info->commit) &&
+		    ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
+			return -EINVAL;
+		}
+
+		/* Call the helper only if:
+		 * - nf_conntrack_in() was executed above ("!cached") for a
+		 *   confirmed connection, or
+		 * - When committing an unconfirmed connection.
+		 */
+		if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
+		    ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
+			return -EINVAL;
+		}
 	}
 
 	return 0;
@@ -545,15 +806,13 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
 	if (exp) {
 		u8 state;
 
+		/* NOTE: New connections are NATted and Helped only when
+		 * committed, so we are not calling into NAT here.
+		 */
 		state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
 		__ovs_ct_update_key(key, state, &info->zone, exp->master);
-	} else {
-		int err;
-
-		err = __ovs_ct_lookup(net, key, info, skb);
-		if (err)
-			return err;
-	}
+	} else
+		return __ovs_ct_lookup(net, key, info, skb);
 
 	return 0;
 }
@@ -653,6 +912,135 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
 	return 0;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+static int parse_nat(const struct nlattr *attr,
+		     struct ovs_conntrack_info *info, bool log)
+{
+	struct nlattr *a;
+	int rem;
+	bool have_ip_max = false;
+	bool have_proto_max = false;
+	bool ip_vers = (info->family == NFPROTO_IPV6);
+
+	nla_for_each_nested(a, attr, rem) {
+		static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
+			[OVS_NAT_ATTR_SRC] = {0, 0},
+			[OVS_NAT_ATTR_DST] = {0, 0},
+			[OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
+						 sizeof(struct in6_addr)},
+			[OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
+						 sizeof(struct in6_addr)},
+			[OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
+			[OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
+			[OVS_NAT_ATTR_PERSISTENT] = {0, 0},
+			[OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
+			[OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
+		};
+		int type = nla_type(a);
+
+		if (type > OVS_NAT_ATTR_MAX) {
+			OVS_NLERR(log,
+				  "Unknown NAT attribute (type=%d, max=%d).\n",
+				  type, OVS_NAT_ATTR_MAX);
+			return -EINVAL;
+		}
+
+		if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
+			OVS_NLERR(log,
+				  "NAT attribute type %d has unexpected length (%d != %d).\n",
+				  type, nla_len(a),
+				  ovs_nat_attr_lens[type][ip_vers]);
+			return -EINVAL;
+		}
+
+		switch (type) {
+		case OVS_NAT_ATTR_SRC:
+		case OVS_NAT_ATTR_DST:
+			if (info->nat) {
+				OVS_NLERR(log,
+					  "Only one type of NAT may be specified.\n"
+					  );
+				return -ERANGE;
+			}
+			info->nat |= OVS_CT_NAT;
+			info->nat |= ((type == OVS_NAT_ATTR_SRC)
+					? OVS_CT_SRC_NAT : OVS_CT_DST_NAT);
+			break;
+
+		case OVS_NAT_ATTR_IP_MIN:
+			nla_memcpy(&info->range.min_addr, a, nla_len(a));
+			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+			break;
+
+		case OVS_NAT_ATTR_IP_MAX:
+			have_ip_max = true;
+			nla_memcpy(&info->range.max_addr, a,
+				   sizeof(info->range.max_addr));
+			info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_MIN:
+			info->range.min_proto.all = htons(nla_get_u16(a));
+			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_MAX:
+			have_proto_max = true;
+			info->range.max_proto.all = htons(nla_get_u16(a));
+			info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+			break;
+
+		case OVS_NAT_ATTR_PERSISTENT:
+			info->range.flags |= NF_NAT_RANGE_PERSISTENT;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_HASH:
+			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
+			break;
+
+		case OVS_NAT_ATTR_PROTO_RANDOM:
+			info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
+			break;
+
+		default:
+			OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
+			return -EINVAL;
+		}
+	}
+
+	if (rem > 0) {
+		OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
+		return -EINVAL;
+	}
+	if (!info->nat) {
+		/* Do not allow flags if no type is given. */
+		if (info->range.flags) {
+			OVS_NLERR(log,
+				  "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n"
+				  );
+			return -EINVAL;
+		}
+		info->nat = OVS_CT_NAT;   /* NAT existing connections. */
+	} else if (!info->commit) {
+		OVS_NLERR(log,
+			  "NAT attributes may be specified only when CT COMMIT flag is also specified.\n"
+			  );
+		return -EINVAL;
+	}
+	/* Allow missing IP_MAX. */
+	if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
+		memcpy(&info->range.max_addr, &info->range.min_addr,
+		       sizeof(info->range.max_addr));
+	}
+	/* Allow missing PROTO_MAX. */
+	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+	    !have_proto_max) {
+		info->range.max_proto.all = info->range.min_proto.all;
+	}
+	return 0;
+}
+#endif
+
 static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_COMMIT]	= { .minlen = 0, .maxlen = 0 },
 	[OVS_CT_ATTR_ZONE]	= { .minlen = sizeof(u16),
@@ -662,7 +1050,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
 	[OVS_CT_ATTR_LABELS]	= { .minlen = sizeof(struct md_labels),
 				    .maxlen = sizeof(struct md_labels) },
 	[OVS_CT_ATTR_HELPER]	= { .minlen = 1,
-				    .maxlen = NF_CT_HELPER_NAME_LEN }
+				    .maxlen = NF_CT_HELPER_NAME_LEN },
+#ifdef CONFIG_NF_NAT_NEEDED
+	/* NAT length is checked when parsing the nested attributes. */
+	[OVS_CT_ATTR_NAT]	= { .minlen = 0, .maxlen = INT_MAX },
+#endif
 };
 
 static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -729,6 +1121,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
 				return -EINVAL;
 			}
 			break;
+#ifdef CONFIG_NF_NAT_NEEDED
+		case OVS_CT_ATTR_NAT: {
+			int err = parse_nat(a, info, log);
+
+			if (err)
+				return err;
+			break;
+		}
+#endif
 		default:
 			OVS_NLERR(log, "Unknown conntrack attr (%d)",
 				  type);
@@ -816,6 +1217,74 @@ err_free_ct:
 	return err;
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
+			       struct sk_buff *skb)
+{
+	struct nlattr *start;
+
+	start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
+	if (!start)
+		return false;
+
+	if (info->nat & OVS_CT_SRC_NAT) {
+		if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
+			return false;
+	} else if (info->nat & OVS_CT_DST_NAT) {
+		if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
+			return false;
+	} else {
+		goto out;
+	}
+
+	if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
+		if (info->family == NFPROTO_IPV4) {
+			if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
+					    info->range.min_addr.ip) ||
+			    (info->range.max_addr.ip
+			     != info->range.min_addr.ip &&
+			     (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
+					      info->range.max_addr.ip))))
+				return false;
+#if IS_ENABLED(CONFIG_NF_NAT_IPV6)
+		} else if (info->family == NFPROTO_IPV6) {
+			if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
+					     &info->range.min_addr.in6) ||
+			    (memcmp(&info->range.max_addr.in6,
+				    &info->range.min_addr.in6,
+				    sizeof(info->range.max_addr.in6)) &&
+			     (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
+					       &info->range.max_addr.in6))))
+				return false;
+#endif
+		} else {
+			return false;
+		}
+	}
+	if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+	    (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
+			 ntohs(info->range.min_proto.all)) ||
+	     (info->range.max_proto.all != info->range.min_proto.all &&
+	      nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
+			  ntohs(info->range.max_proto.all)))))
+		return false;
+
+	if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
+		return false;
+	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
+		return false;
+	if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
+	    nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
+		return false;
+out:
+	nla_nest_end(skb, start);
+
+	return true;
+}
+#endif
+
 int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 			  struct sk_buff *skb)
 {
@@ -844,7 +1313,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
 				   ct_info->helper->name))
 			return -EMSGSIZE;
 	}
-
+#ifdef CONFIG_NF_NAT_NEEDED
+	if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
+		return -EMSGSIZE;
+#endif
 	nla_nest_end(skb, start);
 
 	return 0;
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index a7544f405c16..8f6230bd6183 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a);
 
 #define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
 			   OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
-			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED)
+			   OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \
+			   OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
 #else
 #include <linux/errno.h>
 
-- 
cgit v1.2.3


From e9a7c2f1a548f34bcaa7640094201e8b29247940 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 15 Mar 2016 14:58:25 -0700
Subject: autofs4: coding style fixes

Try and make the coding style completely consistent throughtout the
autofs module and inline with kernel coding style recommendations.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/autofs_i.h          | 55 ++++++++++++------------
 fs/autofs4/dev-ioctl.c         | 28 ++++++++-----
 fs/autofs4/expire.c            | 60 ++++++++++++++------------
 fs/autofs4/init.c              | 10 ++---
 fs/autofs4/inode.c             | 25 ++++++-----
 fs/autofs4/root.c              | 95 +++++++++++++++++++++++-------------------
 fs/autofs4/symlink.c           | 11 ++---
 fs/autofs4/waitq.c             | 49 ++++++++++++----------
 include/linux/auto_dev-ioctl.h |  1 -
 include/linux/auto_fs.h        | 10 ++---
 include/uapi/linux/auto_fs.h   | 19 ++++-----
 include/uapi/linux/auto_fs4.h  | 15 ++-----
 12 files changed, 190 insertions(+), 188 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index c37149b929be..e50cfae487b2 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -1,15 +1,11 @@
-/* -*- c -*- ------------------------------------------------------------- *
- *   
- * linux/fs/autofs/autofs_i.h
- *
- *   Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
- *   Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ *  Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
+ *  Copyright 2005-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
+ */
 
 /* Internal header file for autofs */
 
@@ -35,7 +31,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <asm/current.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 /* #define DEBUG */
 
@@ -51,12 +47,14 @@
 	printk(KERN_ERR "pid %d: %s: " fmt "\n",	\
 		current->pid, __func__, ##__VA_ARGS__)
 
-/* Unified info structure.  This is pointed to by both the dentry and
-   inode structures.  Each file in the filesystem has an instance of this
-   structure.  It holds a reference to the dentry, so dentries are never
-   flushed while the file exists.  All name lookups are dealt with at the
-   dentry level, although the filesystem can interfere in the validation
-   process.  Readdir is implemented by traversing the dentry lists. */
+/*
+ * Unified info structure.  This is pointed to by both the dentry and
+ * inode structures.  Each file in the filesystem has an instance of this
+ * structure.  It holds a reference to the dentry, so dentries are never
+ * flushed while the file exists.  All name lookups are dealt with at the
+ * dentry level, although the filesystem can interfere in the validation
+ * process.  Readdir is implemented by traversing the dentry lists.
+ */
 struct autofs_info {
 	struct dentry	*dentry;
 	struct inode	*inode;
@@ -78,7 +76,7 @@ struct autofs_info {
 	kgid_t gid;
 };
 
-#define AUTOFS_INF_EXPIRING	(1<<0) /* dentry is in the process of expiring */
+#define AUTOFS_INF_EXPIRING	(1<<0) /* dentry in the process of expiring */
 #define AUTOFS_INF_NO_RCU	(1<<1) /* the dentry is being considered
 					* for expiry, so RCU_walk is
 					* not permitted
@@ -140,10 +138,11 @@ static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry)
 }
 
 /* autofs4_oz_mode(): do we see the man behind the curtain?  (The
-   processes which do manipulations for us in user space sees the raw
-   filesystem without "magic".) */
-
-static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) {
+ * processes which do manipulations for us in user space sees the raw
+ * filesystem without "magic".)
+ */
+static inline int autofs4_oz_mode(struct autofs_sb_info *sbi)
+{
 	return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp;
 }
 
@@ -154,12 +153,12 @@ void autofs4_free_ino(struct autofs_info *);
 int is_autofs4_dentry(struct dentry *);
 int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
-			struct autofs_sb_info *,
-			struct autofs_packet_expire __user *);
+		       struct autofs_sb_info *,
+		       struct autofs_packet_expire __user *);
 int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 			    struct autofs_sb_info *sbi, int when);
 int autofs4_expire_multi(struct super_block *, struct vfsmount *,
-			struct autofs_sb_info *, int __user *);
+			 struct autofs_sb_info *, int __user *);
 struct dentry *autofs4_expire_direct(struct super_block *sb,
 				     struct vfsmount *mnt,
 				     struct autofs_sb_info *sbi, int how);
@@ -224,8 +223,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
 
 /* Queue management functions */
 
-int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
-int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
+int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
+int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
 
 static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi)
@@ -242,37 +241,37 @@ static inline void __autofs4_add_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
 	if (ino) {
 		if (list_empty(&ino->expiring))
 			list_add(&ino->expiring, &sbi->expiring_list);
 	}
-	return;
 }
 
 static inline void autofs4_add_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		if (list_empty(&ino->expiring))
 			list_add(&ino->expiring, &sbi->expiring_list);
 		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 static inline void autofs4_del_expiring(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		if (!list_empty(&ino->expiring))
 			list_del_init(&ino->expiring);
 		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 extern void autofs4_kill_sb(struct super_block *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index ac7d921ed984..c64b9fa839c5 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -72,8 +72,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
 {
 	int err = 0;
 
-	if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) ||
-	    (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) {
+	if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) ||
+	    (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) {
 		AUTOFS_WARN("ioctl control interface version mismatch: "
 		     "kernel(%u.%u), user(%u.%u), cmd(%d)",
 		     AUTOFS_DEV_IOCTL_VERSION_MAJOR,
@@ -93,7 +93,8 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
  * Copy parameter control struct, including a possible path allocated
  * at the end of the struct.
  */
-static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
+static struct autofs_dev_ioctl *
+		copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
 {
 	struct autofs_dev_ioctl tmp, *res;
 
@@ -116,7 +117,6 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
 static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
 {
 	kfree(param);
-	return;
 }
 
 /*
@@ -197,7 +197,9 @@ static int find_autofs_mount(const char *pathname,
 			     void *data)
 {
 	struct path path;
-	int err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
+	int err;
+
+	err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0);
 	if (err)
 		return err;
 	err = -ENOENT;
@@ -225,6 +227,7 @@ static int test_by_dev(struct path *path, void *p)
 static int test_by_type(struct path *path, void *p)
 {
 	struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+
 	return ino && ino->sbi->type & *(unsigned *)p;
 }
 
@@ -456,8 +459,10 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		err = 0;
 		autofs4_expire_wait(path.dentry, 0);
 		spin_lock(&sbi->fs_lock);
-		param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid);
-		param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid);
+		param->requester.uid =
+			from_kuid_munged(current_user_ns(), ino->uid);
+		param->requester.gid =
+			from_kgid_munged(current_user_ns(), ino->gid);
 		spin_unlock(&sbi->fs_lock);
 	}
 	path_put(&path);
@@ -619,7 +624,8 @@ static ioctl_fn lookup_dev_ioctl(unsigned int cmd)
 }
 
 /* ioctl dispatcher */
-static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user)
+static int _autofs_dev_ioctl(unsigned int command,
+			     struct autofs_dev_ioctl __user *user)
 {
 	struct autofs_dev_ioctl *param;
 	struct file *fp;
@@ -711,6 +717,7 @@ out:
 static long autofs_dev_ioctl(struct file *file, uint command, ulong u)
 {
 	int err;
+
 	err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u);
 	return (long) err;
 }
@@ -733,8 +740,8 @@ static const struct file_operations _dev_ioctl_fops = {
 
 static struct miscdevice _autofs_dev_ioctl_misc = {
 	.minor		= AUTOFS_MINOR,
-	.name  		= AUTOFS_DEVICE_NAME,
-	.fops  		= &_dev_ioctl_fops
+	.name		= AUTOFS_DEVICE_NAME,
+	.fops		= &_dev_ioctl_fops
 };
 
 MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
@@ -757,6 +764,5 @@ int __init autofs_dev_ioctl_init(void)
 void autofs_dev_ioctl_exit(void)
 {
 	misc_deregister(&_autofs_dev_ioctl_misc);
-	return;
 }
 
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 1cebc3c52fa5..8ba37c73b372 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/expire.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *  Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- *  Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include "autofs_i.h"
 
@@ -18,7 +14,7 @@ static unsigned long now;
 
 /* Check if a dentry can be expired */
 static inline int autofs4_can_expire(struct dentry *dentry,
-					unsigned long timeout, int do_now)
+				     unsigned long timeout, int do_now)
 {
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 
@@ -58,7 +54,9 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 
 	/* Update the expiry counter if fs is busy */
 	if (!may_umount_tree(path.mnt)) {
-		struct autofs_info *ino = autofs4_dentry_ino(top);
+		struct autofs_info *ino;
+
+		ino = autofs4_dentry_ino(top);
 		ino->last_used = jiffies;
 		goto done;
 	}
@@ -74,7 +72,7 @@ done:
  * Calculate and dget next entry in the subdirs list under root.
  */
 static struct dentry *get_next_positive_subdir(struct dentry *prev,
-						struct dentry *root)
+					       struct dentry *root)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
 	struct list_head *next;
@@ -121,7 +119,7 @@ cont:
  * Calculate and dget next entry in top down tree traversal.
  */
 static struct dentry *get_next_positive_dentry(struct dentry *prev,
-						struct dentry *root)
+					       struct dentry *root)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb);
 	struct list_head *next;
@@ -187,15 +185,17 @@ again:
  * autofs submounts.
  */
 static int autofs4_direct_busy(struct vfsmount *mnt,
-				struct dentry *top,
-				unsigned long timeout,
-				int do_now)
+			       struct dentry *top,
+			       unsigned long timeout,
+			       int do_now)
 {
 	DPRINTK("top %p %pd", top, top);
 
 	/* If it's busy update the expiry counters */
 	if (!may_umount_tree(mnt)) {
-		struct autofs_info *ino = autofs4_dentry_ino(top);
+		struct autofs_info *ino;
+
+		ino = autofs4_dentry_ino(top);
 		if (ino)
 			ino->last_used = jiffies;
 		return 1;
@@ -208,7 +208,8 @@ static int autofs4_direct_busy(struct vfsmount *mnt,
 	return 0;
 }
 
-/* Check a directory tree of mount points for busyness
+/*
+ * Check a directory tree of mount points for busyness
  * The tree is not busy iff no mountpoints are busy
  */
 static int autofs4_tree_busy(struct vfsmount *mnt,
@@ -404,6 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry,
 	} else {
 		/* Path walk currently on this dentry? */
 		struct dentry *expired;
+
 		ino_count = atomic_read(&ino->count) + 1;
 		if (d_count(dentry) > ino_count)
 			return NULL;
@@ -522,21 +524,22 @@ int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
 
 /* Perform an expiry operation */
 int autofs4_expire_run(struct super_block *sb,
-		      struct vfsmount *mnt,
-		      struct autofs_sb_info *sbi,
-		      struct autofs_packet_expire __user *pkt_p)
+		       struct vfsmount *mnt,
+		       struct autofs_sb_info *sbi,
+		       struct autofs_packet_expire __user *pkt_p)
 {
 	struct autofs_packet_expire pkt;
 	struct autofs_info *ino;
 	struct dentry *dentry;
 	int ret = 0;
 
-	memset(&pkt,0,sizeof pkt);
+	memset(&pkt, 0, sizeof(pkt));
 
 	pkt.hdr.proto_version = sbi->version;
 	pkt.hdr.type = autofs_ptype_expire;
 
-	if ((dentry = autofs4_expire_indirect(sb, mnt, sbi, 0)) == NULL)
+	dentry = autofs4_expire_indirect(sb, mnt, sbi, 0);
+	if (!dentry)
 		return -EAGAIN;
 
 	pkt.len = dentry->d_name.len;
@@ -544,7 +547,7 @@ int autofs4_expire_run(struct super_block *sb,
 	pkt.name[pkt.len] = '\0';
 	dput(dentry);
 
-	if ( copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)) )
+	if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire)))
 		ret = -EFAULT;
 
 	spin_lock(&sbi->fs_lock);
@@ -573,7 +576,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 		struct autofs_info *ino = autofs4_dentry_ino(dentry);
 
 		/* This is synchronous because it makes the daemon a
-                   little easier */
+		 * little easier
+		 */
 		ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
 
 		spin_lock(&sbi->fs_lock);
@@ -588,8 +592,10 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 	return ret;
 }
 
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
-   more to be done */
+/*
+ * Call repeatedly until it returns -EAGAIN, meaning there's nothing
+ * more to be done.
+ */
 int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 			struct autofs_sb_info *sbi, int __user *arg)
 {
diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c
index b3db517e89ec..8cf0e63389ae 100644
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -1,14 +1,10 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/init.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include <linux/module.h>
 #include <linux/init.h>
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 4320faa2d2dc..ad03705aac43 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/inode.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *  Copyright 2005-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2005-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include <linux/kernel.h>
 #include <linux/slab.h>
@@ -24,7 +20,9 @@
 
 struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi)
 {
-	struct autofs_info *ino = kzalloc(sizeof(*ino), GFP_KERNEL);
+	struct autofs_info *ino;
+
+	ino = kzalloc(sizeof(*ino), GFP_KERNEL);
 	if (ino) {
 		INIT_LIST_HEAD(&ino->active);
 		INIT_LIST_HEAD(&ino->expiring);
@@ -152,6 +150,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
+
 		if (!*p)
 			continue;
 
@@ -209,9 +208,9 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
 
 int autofs4_fill_super(struct super_block *s, void *data, int silent)
 {
-	struct inode * root_inode;
-	struct dentry * root;
-	struct file * pipe;
+	struct inode *root_inode;
+	struct dentry *root;
+	struct file *pipe;
 	int pipefd;
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
@@ -222,7 +221,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
-	DPRINTK("starting up, sbi = %p",sbi);
+	DPRINTK("starting up, sbi = %p", sbi);
 
 	s->s_fs_info = sbi;
 	sbi->magic = AUTOFS_SBI_MAGIC;
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index c6d7d3dbd52a..bdeb8838a901 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -1,16 +1,12 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/root.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *  Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
- *  Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include <linux/capability.h>
 #include <linux/errno.h>
@@ -23,16 +19,18 @@
 
 #include "autofs_i.h"
 
-static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
-static int autofs4_dir_unlink(struct inode *,struct dentry *);
-static int autofs4_dir_rmdir(struct inode *,struct dentry *);
-static int autofs4_dir_mkdir(struct inode *,struct dentry *,umode_t);
-static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
+static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *);
+static int autofs4_dir_unlink(struct inode *, struct dentry *);
+static int autofs4_dir_rmdir(struct inode *, struct dentry *);
+static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t);
+static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long);
 #ifdef CONFIG_COMPAT
-static long autofs4_root_compat_ioctl(struct file *,unsigned int,unsigned long);
+static long autofs4_root_compat_ioctl(struct file *,
+				      unsigned int, unsigned long);
 #endif
 static int autofs4_dir_open(struct inode *inode, struct file *file);
-static struct dentry *autofs4_lookup(struct inode *,struct dentry *, unsigned int);
+static struct dentry *autofs4_lookup(struct inode *,
+				     struct dentry *, unsigned int);
 static struct vfsmount *autofs4_d_automount(struct path *);
 static int autofs4_d_manage(struct dentry *, bool);
 static void autofs4_dentry_release(struct dentry *);
@@ -74,7 +72,9 @@ const struct dentry_operations autofs4_dentry_operations = {
 static void autofs4_add_active(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_info *ino;
+
+	ino = autofs4_dentry_ino(dentry);
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		if (!ino->active_count) {
@@ -84,13 +84,14 @@ static void autofs4_add_active(struct dentry *dentry)
 		ino->active_count++;
 		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 static void autofs4_del_active(struct dentry *dentry)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_info *ino;
+
+	ino = autofs4_dentry_ino(dentry);
 	if (ino) {
 		spin_lock(&sbi->lookup_lock);
 		ino->active_count--;
@@ -100,7 +101,6 @@ static void autofs4_del_active(struct dentry *dentry)
 		}
 		spin_unlock(&sbi->lookup_lock);
 	}
-	return;
 }
 
 static int autofs4_dir_open(struct inode *inode, struct file *file)
@@ -320,7 +320,9 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path)
 	if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) {
 		struct dentry *parent = dentry->d_parent;
 		struct autofs_info *ino;
-		struct dentry *new = d_lookup(parent, &dentry->d_name);
+		struct dentry *new;
+
+		new = d_lookup(parent, &dentry->d_name);
 		if (!new)
 			return NULL;
 		ino = autofs4_dentry_ino(new);
@@ -455,6 +457,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 		 * a mount-trap.
 		 */
 		struct inode *inode;
+
 		if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))
 			return 0;
 		if (d_mountpoint(dentry))
@@ -494,7 +497,8 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 }
 
 /* Lookups in the root directory */
-static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
+static struct dentry *autofs4_lookup(struct inode *dir,
+				     struct dentry *dentry, unsigned int flags)
 {
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
@@ -513,9 +517,9 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
 		autofs4_oz_mode(sbi));
 
 	active = autofs4_lookup_active(dentry);
-	if (active) {
+	if (active)
 		return active;
-	} else {
+	else {
 		/*
 		 * A dentry that is not within the root can never trigger a
 		 * mount operation, unless the directory already exists, so we
@@ -526,7 +530,8 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, u
 			return ERR_PTR(-ENOENT);
 
 		/* Mark entries in the root as mount triggers */
-		if (autofs_type_indirect(sbi->type) && IS_ROOT(dentry->d_parent))
+		if (IS_ROOT(dentry->d_parent) &&
+		    autofs_type_indirect(sbi->type))
 			__managed_dentry_set_managed(dentry);
 
 		ino = autofs4_new_ino(sbi);
@@ -664,7 +669,6 @@ static void autofs_set_leaf_automount_flags(struct dentry *dentry)
 	if (IS_ROOT(parent->d_parent))
 		return;
 	managed_dentry_clear_managed(parent);
-	return;
 }
 
 static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
@@ -687,7 +691,6 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry)
 	if (d_child->next == &parent->d_subdirs &&
 	    d_child->prev == &parent->d_subdirs)
 		managed_dentry_set_managed(parent);
-	return;
 }
 
 static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
@@ -728,7 +731,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	return 0;
 }
 
-static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int autofs4_dir_mkdir(struct inode *dir,
+			     struct dentry *dentry, umode_t mode)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -768,7 +772,7 @@ static int autofs4_dir_mkdir(struct inode *dir, struct dentry *dentry, umode_t m
 /* Get/set timeout ioctl() operation */
 #ifdef CONFIG_COMPAT
 static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
-					 compat_ulong_t __user *p)
+						 compat_ulong_t __user *p)
 {
 	int rv;
 	unsigned long ntimeout;
@@ -787,7 +791,7 @@ static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi,
 #endif
 
 static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
-					 unsigned long __user *p)
+					  unsigned long __user *p)
 {
 	int rv;
 	unsigned long ntimeout;
@@ -805,13 +809,15 @@ static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi,
 }
 
 /* Return protocol version */
-static inline int autofs4_get_protover(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protover(struct autofs_sb_info *sbi,
+				       int __user *p)
 {
 	return put_user(sbi->version, p);
 }
 
 /* Return protocol sub version */
-static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, int __user *p)
+static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi,
+					  int __user *p)
 {
 	return put_user(sbi->sub_version, p);
 }
@@ -834,9 +840,9 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p)
 }
 
 /* Identify autofs4_dentries - this is so we can tell if there's
-   an extra dentry refcount or not.  We only hold a refcount on the
-   dentry if its non-negative (ie, d_inode != NULL)
-*/
+ * an extra dentry refcount or not.  We only hold a refcount on the
+ * dentry if its non-negative (ie, d_inode != NULL)
+ */
 int is_autofs4_dentry(struct dentry *dentry)
 {
 	return dentry && d_really_is_positive(dentry) &&
@@ -855,7 +861,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 	void __user *p = (void __user *)arg;
 
 	DPRINTK("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u",
-		cmd,arg,sbi,task_pgrp_nr(current));
+		cmd, arg, sbi, task_pgrp_nr(current));
 
 	if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
 	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
@@ -864,11 +870,11 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	
-	switch(cmd) {
+	switch (cmd) {
 	case AUTOFS_IOC_READY:	/* Wait queue: go ahead and retry */
-		return autofs4_wait_release(sbi,(autofs_wqt_t)arg,0);
+		return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0);
 	case AUTOFS_IOC_FAIL:	/* Wait queue: fail with ENOENT */
-		return autofs4_wait_release(sbi,(autofs_wqt_t)arg,-ENOENT);
+		return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT);
 	case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */
 		autofs4_catatonic_mode(sbi);
 		return 0;
@@ -888,10 +894,12 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 
 	/* return a single thing to expire */
 	case AUTOFS_IOC_EXPIRE:
-		return autofs4_expire_run(inode->i_sb,filp->f_path.mnt,sbi, p);
+		return autofs4_expire_run(inode->i_sb,
+					  filp->f_path.mnt, sbi, p);
 	/* same as above, but can send multiple expires through pipe */
 	case AUTOFS_IOC_EXPIRE_MULTI:
-		return autofs4_expire_multi(inode->i_sb,filp->f_path.mnt,sbi, p);
+		return autofs4_expire_multi(inode->i_sb,
+					    filp->f_path.mnt, sbi, p);
 
 	default:
 		return -ENOSYS;
@@ -902,12 +910,13 @@ static long autofs4_root_ioctl(struct file *filp,
 			       unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
+
 	return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
 }
 
 #ifdef CONFIG_COMPAT
 static long autofs4_root_compat_ioctl(struct file *filp,
-			     unsigned int cmd, unsigned long arg)
+				      unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
 	int ret;
@@ -916,7 +925,7 @@ static long autofs4_root_compat_ioctl(struct file *filp,
 		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
 	else
 		ret = autofs4_root_ioctl_unlocked(inode, filp, cmd,
-			(unsigned long)compat_ptr(arg));
+					      (unsigned long) compat_ptr(arg));
 
 	return ret;
 }
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index 84e037d1d129..99aab00dc217 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -1,14 +1,10 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/symlink.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include "autofs_i.h"
 
@@ -18,6 +14,7 @@ static const char *autofs4_get_link(struct dentry *dentry,
 {
 	struct autofs_sb_info *sbi;
 	struct autofs_info *ino;
+
 	if (!dentry)
 		return ERR_PTR(-ECHILD);
 	sbi = autofs4_sbi(dentry->d_sb);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 35b755e79c2d..4e0c8d62dc1f 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -1,15 +1,11 @@
-/* -*- c -*- --------------------------------------------------------------- *
- *
- * linux/fs/autofs/waitq.c
- *
- *  Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
- *  Copyright 2001-2006 Ian Kent <raven@themaw.net>
+/*
+ * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved
+ * Copyright 2001-2006 Ian Kent <raven@themaw.net>
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ------------------------------------------------------------------------- */
+ */
 
 #include <linux/slab.h>
 #include <linux/time.h>
@@ -18,7 +14,8 @@
 #include "autofs_i.h"
 
 /* We make this a static variable rather than a part of the superblock; it
-   is better if we don't reassign numbers easily even across filesystems */
+ * is better if we don't reassign numbers easily even across filesystems
+ */
 static autofs_wqt_t autofs4_next_wait_queue = 1;
 
 /* These are the signals we allow interrupting a pending mount */
@@ -69,17 +66,19 @@ static int autofs4_write(struct autofs_sb_info *sbi,
 	set_fs(KERNEL_DS);
 
 	mutex_lock(&sbi->pipe_mutex);
-	while (bytes &&
-	       (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) {
+	wr = __vfs_write(file, data, bytes, &file->f_pos);
+	while (bytes && wr) {
 		data += wr;
 		bytes -= wr;
+		wr = __vfs_write(file, data, bytes, &file->f_pos);
 	}
 	mutex_unlock(&sbi->pipe_mutex);
 
 	set_fs(fs);
 
 	/* Keep the currently executing process from receiving a
-	   SIGPIPE unless it was already supposed to get one */
+	 * SIGPIPE unless it was already supposed to get one
+	 */
 	if (wr == -EPIPE && !sigpipe) {
 		spin_lock_irqsave(&current->sighand->siglock, flags);
 		sigdelset(&current->pending.signal, SIGPIPE);
@@ -103,9 +102,10 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 	size_t pktsz;
 
 	DPRINTK("wait id = 0x%08lx, name = %.*s, type=%d",
-		(unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type);
+		(unsigned long) wq->wait_queue_token,
+		wq->name.len, wq->name.name, type);
 
-	memset(&pkt,0,sizeof pkt); /* For security reasons */
+	memset(&pkt, 0, sizeof(pkt)); /* For security reasons */
 
 	pkt.hdr.proto_version = sbi->version;
 	pkt.hdr.type = type;
@@ -126,7 +126,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 	}
 	case autofs_ptype_expire_multi:
 	{
-		struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi;
+		struct autofs_packet_expire_multi *ep =
+					&pkt.v4_pkt.expire_multi;
 
 		pktsz = sizeof(*ep);
 
@@ -231,7 +232,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
 		if (wq->name.hash == qstr->hash &&
 		    wq->name.len == qstr->len &&
 		    wq->name.name &&
-			 !memcmp(wq->name.name, qstr->name, qstr->len))
+		    !memcmp(wq->name.name, qstr->name, qstr->len))
 			break;
 	}
 	return wq;
@@ -248,7 +249,7 @@ autofs4_find_wait(struct autofs_sb_info *sbi, struct qstr *qstr)
 static int validate_request(struct autofs_wait_queue **wait,
 			    struct autofs_sb_info *sbi,
 			    struct qstr *qstr,
-			    struct dentry*dentry, enum autofs_notify notify)
+			    struct dentry *dentry, enum autofs_notify notify)
 {
 	struct autofs_wait_queue *wq;
 	struct autofs_info *ino;
@@ -322,8 +323,10 @@ static int validate_request(struct autofs_wait_queue **wait,
 		 * continue on and create a new request.
 		 */
 		if (!IS_ROOT(dentry)) {
-			if (d_really_is_positive(dentry) && d_unhashed(dentry)) {
+			if (d_unhashed(dentry) &&
+			    d_really_is_positive(dentry)) {
 				struct dentry *parent = dentry->d_parent;
+
 				new = d_lookup(parent, &dentry->d_name);
 				if (new)
 					dentry = new;
@@ -340,8 +343,8 @@ static int validate_request(struct autofs_wait_queue **wait,
 	return 1;
 }
 
-int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
-		enum autofs_notify notify)
+int autofs4_wait(struct autofs_sb_info *sbi,
+		 struct dentry *dentry, enum autofs_notify notify)
 {
 	struct autofs_wait_queue *wq;
 	struct qstr qstr;
@@ -411,7 +414,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 
 	if (!wq) {
 		/* Create a new wait queue */
-		wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL);
+		wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
 		if (!wq) {
 			kfree(qstr.name);
 			mutex_unlock(&sbi->wq_mutex);
@@ -454,7 +457,9 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 			(unsigned long) wq->wait_queue_token, wq->name.len,
 			wq->name.name, notify);
 
-		/* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */
+		/*
+		 * autofs4_notify_daemon() may block; it will unlock ->wq_mutex
+		 */
 		autofs4_notify_daemon(sbi, wq, type);
 	} else {
 		wq->wait_ctr++;
diff --git a/include/linux/auto_dev-ioctl.h b/include/linux/auto_dev-ioctl.h
index 850f39b33e74..642781612062 100644
--- a/include/linux/auto_dev-ioctl.h
+++ b/include/linux/auto_dev-ioctl.h
@@ -125,7 +125,6 @@ static inline void init_autofs_dev_ioctl(struct autofs_dev_ioctl *in)
 	in->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
 	in->size = sizeof(struct autofs_dev_ioctl);
 	in->ioctlfd = -1;
-	return;
 }
 
 /*
diff --git a/include/linux/auto_fs.h b/include/linux/auto_fs.h
index fcd704d354c4..b4066bb89083 100644
--- a/include/linux/auto_fs.h
+++ b/include/linux/auto_fs.h
@@ -1,14 +1,10 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *   
- * linux/include/linux/auto_fs.h
- *
- *   Copyright 1997 Transmeta Corporation - All Rights Reserved
+/*
+ * Copyright 1997 Transmeta Corporation - All Rights Reserved
  *
  * This file is part of the Linux kernel and is made available under
  * the terms of the GNU General Public License, version 2, or at your
  * option, any later version, incorporated herein by reference.
- *
- * ----------------------------------------------------------------------- */
+ */
 
 #ifndef _LINUX_AUTO_FS_H
 #define _LINUX_AUTO_FS_H
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index bb991dfe134f..5fe176aa61d1 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -1,7 +1,4 @@
-/* -*- linux-c -*- ------------------------------------------------------- *
- *   
- * linux/include/linux/auto_fs.h
- *
+/*
  *   Copyright 1997 Transmeta Corporation - All Rights Reserved
  *
  * This file is part of the Linux kernel and is made available under
@@ -63,12 +60,12 @@ struct autofs_packet_expire {
 	char name[NAME_MAX+1];
 };
 
-#define AUTOFS_IOC_READY      _IO(0x93,0x60)
-#define AUTOFS_IOC_FAIL       _IO(0x93,0x61)
-#define AUTOFS_IOC_CATATONIC  _IO(0x93,0x62)
-#define AUTOFS_IOC_PROTOVER   _IOR(0x93,0x63,int)
-#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93,0x64,compat_ulong_t)
-#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93,0x64,unsigned long)
-#define AUTOFS_IOC_EXPIRE     _IOR(0x93,0x65,struct autofs_packet_expire)
+#define AUTOFS_IOC_READY      _IO(0x93, 0x60)
+#define AUTOFS_IOC_FAIL       _IO(0x93, 0x61)
+#define AUTOFS_IOC_CATATONIC  _IO(0x93, 0x62)
+#define AUTOFS_IOC_PROTOVER   _IOR(0x93, 0x63, int)
+#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(0x93, 0x64, compat_ulong_t)
+#define AUTOFS_IOC_SETTIMEOUT _IOWR(0x93, 0x64, unsigned long)
+#define AUTOFS_IOC_EXPIRE     _IOR(0x93, 0x65, struct autofs_packet_expire)
 
 #endif /* _UAPI_LINUX_AUTO_FS_H */
diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
index e02982fa2953..924fb1adab0b 100644
--- a/include/uapi/linux/auto_fs4.h
+++ b/include/uapi/linux/auto_fs4.h
@@ -1,6 +1,4 @@
-/* -*- c -*-
- * linux/include/linux/auto_fs4.h
- *
+/*
  * Copyright 1999-2000 Jeremy Fitzhardinge <jeremy@goop.org>
  *
  * This file is part of the Linux kernel and is made available under
@@ -38,7 +36,6 @@
 static inline void set_autofs_type_indirect(unsigned int *type)
 {
 	*type = AUTOFS_TYPE_INDIRECT;
-	return;
 }
 
 static inline unsigned int autofs_type_indirect(unsigned int type)
@@ -49,7 +46,6 @@ static inline unsigned int autofs_type_indirect(unsigned int type)
 static inline void set_autofs_type_direct(unsigned int *type)
 {
 	*type = AUTOFS_TYPE_DIRECT;
-	return;
 }
 
 static inline unsigned int autofs_type_direct(unsigned int type)
@@ -60,7 +56,6 @@ static inline unsigned int autofs_type_direct(unsigned int type)
 static inline void set_autofs_type_offset(unsigned int *type)
 {
 	*type = AUTOFS_TYPE_OFFSET;
-	return;
 }
 
 static inline unsigned int autofs_type_offset(unsigned int type)
@@ -81,7 +76,6 @@ static inline unsigned int autofs_type_trigger(unsigned int type)
 static inline void set_autofs_type_any(unsigned int *type)
 {
 	*type = AUTOFS_TYPE_ANY;
-	return;
 }
 
 static inline unsigned int autofs_type_any(unsigned int type)
@@ -154,11 +148,10 @@ union autofs_v5_packet_union {
 	autofs_packet_expire_direct_t expire_direct;
 };
 
-#define AUTOFS_IOC_EXPIRE_MULTI		_IOW(0x93,0x66,int)
+#define AUTOFS_IOC_EXPIRE_MULTI		_IOW(0x93, 0x66, int)
 #define AUTOFS_IOC_EXPIRE_INDIRECT	AUTOFS_IOC_EXPIRE_MULTI
 #define AUTOFS_IOC_EXPIRE_DIRECT	AUTOFS_IOC_EXPIRE_MULTI
-#define AUTOFS_IOC_PROTOSUBVER		_IOR(0x93,0x67,int)
-#define AUTOFS_IOC_ASKUMOUNT		_IOR(0x93,0x70,int)
-
+#define AUTOFS_IOC_PROTOSUBVER		_IOR(0x93, 0x67, int)
+#define AUTOFS_IOC_ASKUMOUNT		_IOR(0x93, 0x70, int)
 
 #endif /* _LINUX_AUTO_FS4_H */
-- 
cgit v1.2.3


From 0266725ad4ee0f8fcf2ee73be8e68c4adbf2ac79 Mon Sep 17 00:00:00 2001
From: Ian Kent <raven@themaw.net>
Date: Tue, 15 Mar 2016 14:58:36 -0700
Subject: autofs4: fix some white space errors

Fix some white space format errors.

Signed-off-by: Ian Kent <raven@themaw.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/dev-ioctl.c        | 1 -
 fs/autofs4/inode.c            | 2 +-
 fs/autofs4/root.c             | 8 ++++----
 fs/autofs4/waitq.c            | 3 +--
 include/uapi/linux/auto_fs.h  | 2 +-
 include/uapi/linux/auto_fs4.h | 2 +-
 6 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index c64b9fa839c5..b8d0329ba775 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -765,4 +765,3 @@ void autofs_dev_ioctl_exit(void)
 {
 	misc_deregister(&_autofs_dev_ioctl_misc);
 }
-
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index ad03705aac43..7872830d3de9 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -327,7 +327,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	 */
 	s->s_root = root;
 	return 0;
-	
+
 	/*
 	 * Failure ... clean up.
 	 */
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index aa8228eb104b..18c39824a009 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -618,7 +618,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	struct autofs_info *p_ino;
-	
+
 	/* This allows root to remove symlinks */
 	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -698,7 +698,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 	struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	struct autofs_info *p_ino;
-	
+
 	DPRINTK("dentry %p, removing %pd", dentry, dentry);
 
 	if (!autofs4_oz_mode(sbi))
@@ -878,10 +878,10 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 	if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) ||
 	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
 		return -ENOTTY;
-	
+
 	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
-	
+
 	switch (cmd) {
 	case AUTOFS_IOC_READY:	/* Wait queue: go ahead and retry */
 		return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 4aeae3b9f278..a8a94621d813 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -88,7 +88,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
 
 	return (bytes > 0);
 }
-	
+
 static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 				 struct autofs_wait_queue *wq,
 				 int type)
@@ -569,4 +569,3 @@ int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_tok
 
 	return 0;
 }
-
diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h
index 5fe176aa61d1..9175a1b4dc69 100644
--- a/include/uapi/linux/auto_fs.h
+++ b/include/uapi/linux/auto_fs.h
@@ -48,7 +48,7 @@ struct autofs_packet_hdr {
 
 struct autofs_packet_missing {
 	struct autofs_packet_hdr hdr;
-        autofs_wqt_t wait_queue_token;
+	autofs_wqt_t wait_queue_token;
 	int len;
 	char name[NAME_MAX+1];
 };	
diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h
index 924fb1adab0b..8f8f1bdcca8c 100644
--- a/include/uapi/linux/auto_fs4.h
+++ b/include/uapi/linux/auto_fs4.h
@@ -108,7 +108,7 @@ enum autofs_notify {
 /* v4 multi expire (via pipe) */
 struct autofs_packet_expire_multi {
 	struct autofs_packet_hdr hdr;
-        autofs_wqt_t wait_queue_token;
+	autofs_wqt_t wait_queue_token;
 	int len;
 	char name[NAME_MAX+1];
 };
-- 
cgit v1.2.3


From 5057dcd0f1aaad57e07e728ba20a99e205c6b9de Mon Sep 17 00:00:00 2001
From: Igor Redko <redkoi@virtuozzo.com>
Date: Thu, 17 Mar 2016 14:19:08 -0700
Subject: virtio_balloon: export 'available' memory to balloon statistics

Add a new field, VIRTIO_BALLOON_S_AVAIL, to virtio_balloon memory
statistics protocol, corresponding to 'Available' in /proc/meminfo.

It indicates to the hypervisor how big the balloon can be inflated
without pushing the guest system to swap.

Signed-off-by: Igor Redko <redkoi@virtuozzo.com>
Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Roman Kagan <rkagan@virtuozzo.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/virtio/virtio_balloon.c     | 6 ++++++
 include/uapi/linux/virtio_balloon.h | 3 ++-
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 0c3691f46575..f2b77dea8d3c 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -30,6 +30,7 @@
 #include <linux/balloon_compaction.h>
 #include <linux/oom.h>
 #include <linux/wait.h>
+#include <linux/mm.h>
 
 /*
  * Balloon device works in 4K page units.  So each page is pointed to by
@@ -229,10 +230,13 @@ static void update_balloon_stats(struct virtio_balloon *vb)
 	unsigned long events[NR_VM_EVENT_ITEMS];
 	struct sysinfo i;
 	int idx = 0;
+	long available;
 
 	all_vm_events(events);
 	si_meminfo(&i);
 
+	available = si_mem_available();
+
 	update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_IN,
 				pages_to_bytes(events[PSWPIN]));
 	update_stat(vb, idx++, VIRTIO_BALLOON_S_SWAP_OUT,
@@ -243,6 +247,8 @@ static void update_balloon_stats(struct virtio_balloon *vb)
 				pages_to_bytes(i.freeram));
 	update_stat(vb, idx++, VIRTIO_BALLOON_S_MEMTOT,
 				pages_to_bytes(i.totalram));
+	update_stat(vb, idx++, VIRTIO_BALLOON_S_AVAIL,
+				pages_to_bytes(available));
 }
 
 /*
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index d7f1cbc3766c..343d7ddefe04 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -51,7 +51,8 @@ struct virtio_balloon_config {
 #define VIRTIO_BALLOON_S_MINFLT   3   /* Number of minor faults */
 #define VIRTIO_BALLOON_S_MEMFREE  4   /* Total amount of free memory */
 #define VIRTIO_BALLOON_S_MEMTOT   5   /* Total amount of memory */
-#define VIRTIO_BALLOON_S_NR       6
+#define VIRTIO_BALLOON_S_AVAIL    6   /* Available memory as in /proc */
+#define VIRTIO_BALLOON_S_NR       7
 
 /*
  * Memory statistics structure.
-- 
cgit v1.2.3


From faeb50b98a337abf7a11375e06a83cda23c8ca67 Mon Sep 17 00:00:00 2001
From: Rob Landley <rob@landley.net>
Date: Thu, 17 Mar 2016 14:21:17 -0700
Subject: include/uapi/linux/elf-em.h: remove v850

The v850 port was removed by commits f606ddf42fd4 and 07a887d399b8 in
2008.  These #defines are not used in the current kernel.

Signed-off-by: Rob Landley <rob@landley.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/elf-em.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h
index b56dfcfe922a..c3fdfe79e5cc 100644
--- a/include/uapi/linux/elf-em.h
+++ b/include/uapi/linux/elf-em.h
@@ -30,7 +30,6 @@
 #define EM_X86_64	62	/* AMD x86-64 */
 #define EM_S390		22	/* IBM S/390 */
 #define EM_CRIS		76	/* Axis Communications 32-bit embedded processor */
-#define EM_V850		87	/* NEC v850 */
 #define EM_M32R		88	/* Renesas M32R */
 #define EM_MN10300	89	/* Panasonic/MEI MN10300, AM33 */
 #define EM_OPENRISC     92     /* OpenRISC 32-bit embedded processor */
@@ -50,8 +49,6 @@
  */
 #define EM_ALPHA	0x9026
 
-/* Bogus old v850 magic number, used by old tools. */
-#define EM_CYGNUS_V850	0x9080
 /* Bogus old m32r magic number, used by old tools. */
 #define EM_CYGNUS_M32R	0x9041
 /* This is the old interim value for S/390 architecture */
-- 
cgit v1.2.3


From bc27fb68aaad44dd8f5c34924f05721f0abaeec1 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Thu, 17 Mar 2016 14:22:44 -0700
Subject: include/uapi/linux/byteorder, swab: force inlining of some byteswap
 operations

Sometimes gcc mysteriously doesn't inline
very small functions we expect to be inlined. See

    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

With this .config:
http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os,
the following functions get deinlined many times.
Examples of disassembly:

<get_unaligned_be16> (12 copies, 51 calls):
       66 8b 07                mov    (%rdi),%ax
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       86 e0                   xchg   %ah,%al
       5d                      pop    %rbp
       c3                      retq

<get_unaligned_be32> (12 copies, 135 calls):
       8b 07                   mov    (%rdi),%eax
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       0f c8                   bswap  %eax
       5d                      pop    %rbp
       c3                      retq

<get_unaligned_be64> (2 copies, 20 calls):
       48 8b 07                mov    (%rdi),%rax
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       48 0f c8                bswap  %rax
       5d                      pop    %rbp
       c3                      retq

<__swab16p> (16 copies, 146 calls):
       55                      push   %rbp
       89 f8                   mov    %edi,%eax
       86 e0                   xchg   %ah,%al
       48 89 e5                mov    %rsp,%rbp
       5d                      pop    %rbp
       c3                      retq

<__swab32p> (43 copies, ~560 calls):
       55                      push   %rbp
       89 f8                   mov    %edi,%eax
       0f c8                   bswap  %eax
       48 89 e5                mov    %rsp,%rbp
       5d                      pop    %rbp
       c3                      retq

<__swab64p> (21 copies, 119 calls):
       55                      push   %rbp
       48 89 f8                mov    %rdi,%rax
       48 0f c8                bswap  %rax
       48 89 e5                mov    %rsp,%rbp
       5d                      pop    %rbp
       c3                      retq

<__swab32s> (6 copies, 47 calls):
       8b 07                   mov    (%rdi),%eax
       55                      push   %rbp
       48 89 e5                mov    %rsp,%rbp
       0f c8                   bswap  %eax
       89 07                   mov    %eax,(%rdi)
       5d                      pop    %rbp
       c3                      retq

This patch fixes this via s/inline/__always_inline/.
Code size decrease after the patch is ~4.5k:

    text     data      bss       dec     hex filename
92202377 20826112 36417536 149446025 8e85d89 vmlinux
92197848 20826112 36417536 149441496 8e84bd8 vmlinux5_swap_after

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/uapi/linux/byteorder/big_endian.h    | 24 ++++++++++++------------
 include/uapi/linux/byteorder/little_endian.h | 24 ++++++++++++------------
 include/uapi/linux/swab.h                    | 10 +++++-----
 3 files changed, 29 insertions(+), 29 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/byteorder/big_endian.h b/include/uapi/linux/byteorder/big_endian.h
index 672374450095..cdab17ab907c 100644
--- a/include/uapi/linux/byteorder/big_endian.h
+++ b/include/uapi/linux/byteorder/big_endian.h
@@ -40,51 +40,51 @@
 #define __cpu_to_be16(x) ((__force __be16)(__u16)(x))
 #define __be16_to_cpu(x) ((__force __u16)(__be16)(x))
 
-static inline __le64 __cpu_to_le64p(const __u64 *p)
+static __always_inline __le64 __cpu_to_le64p(const __u64 *p)
 {
 	return (__force __le64)__swab64p(p);
 }
-static inline __u64 __le64_to_cpup(const __le64 *p)
+static __always_inline __u64 __le64_to_cpup(const __le64 *p)
 {
 	return __swab64p((__u64 *)p);
 }
-static inline __le32 __cpu_to_le32p(const __u32 *p)
+static __always_inline __le32 __cpu_to_le32p(const __u32 *p)
 {
 	return (__force __le32)__swab32p(p);
 }
-static inline __u32 __le32_to_cpup(const __le32 *p)
+static __always_inline __u32 __le32_to_cpup(const __le32 *p)
 {
 	return __swab32p((__u32 *)p);
 }
-static inline __le16 __cpu_to_le16p(const __u16 *p)
+static __always_inline __le16 __cpu_to_le16p(const __u16 *p)
 {
 	return (__force __le16)__swab16p(p);
 }
-static inline __u16 __le16_to_cpup(const __le16 *p)
+static __always_inline __u16 __le16_to_cpup(const __le16 *p)
 {
 	return __swab16p((__u16 *)p);
 }
-static inline __be64 __cpu_to_be64p(const __u64 *p)
+static __always_inline __be64 __cpu_to_be64p(const __u64 *p)
 {
 	return (__force __be64)*p;
 }
-static inline __u64 __be64_to_cpup(const __be64 *p)
+static __always_inline __u64 __be64_to_cpup(const __be64 *p)
 {
 	return (__force __u64)*p;
 }
-static inline __be32 __cpu_to_be32p(const __u32 *p)
+static __always_inline __be32 __cpu_to_be32p(const __u32 *p)
 {
 	return (__force __be32)*p;
 }
-static inline __u32 __be32_to_cpup(const __be32 *p)
+static __always_inline __u32 __be32_to_cpup(const __be32 *p)
 {
 	return (__force __u32)*p;
 }
-static inline __be16 __cpu_to_be16p(const __u16 *p)
+static __always_inline __be16 __cpu_to_be16p(const __u16 *p)
 {
 	return (__force __be16)*p;
 }
-static inline __u16 __be16_to_cpup(const __be16 *p)
+static __always_inline __u16 __be16_to_cpup(const __be16 *p)
 {
 	return (__force __u16)*p;
 }
diff --git a/include/uapi/linux/byteorder/little_endian.h b/include/uapi/linux/byteorder/little_endian.h
index d876736a0017..4b93f2b260dd 100644
--- a/include/uapi/linux/byteorder/little_endian.h
+++ b/include/uapi/linux/byteorder/little_endian.h
@@ -40,51 +40,51 @@
 #define __cpu_to_be16(x) ((__force __be16)__swab16((x)))
 #define __be16_to_cpu(x) __swab16((__force __u16)(__be16)(x))
 
-static inline __le64 __cpu_to_le64p(const __u64 *p)
+static __always_inline __le64 __cpu_to_le64p(const __u64 *p)
 {
 	return (__force __le64)*p;
 }
-static inline __u64 __le64_to_cpup(const __le64 *p)
+static __always_inline __u64 __le64_to_cpup(const __le64 *p)
 {
 	return (__force __u64)*p;
 }
-static inline __le32 __cpu_to_le32p(const __u32 *p)
+static __always_inline __le32 __cpu_to_le32p(const __u32 *p)
 {
 	return (__force __le32)*p;
 }
-static inline __u32 __le32_to_cpup(const __le32 *p)
+static __always_inline __u32 __le32_to_cpup(const __le32 *p)
 {
 	return (__force __u32)*p;
 }
-static inline __le16 __cpu_to_le16p(const __u16 *p)
+static __always_inline __le16 __cpu_to_le16p(const __u16 *p)
 {
 	return (__force __le16)*p;
 }
-static inline __u16 __le16_to_cpup(const __le16 *p)
+static __always_inline __u16 __le16_to_cpup(const __le16 *p)
 {
 	return (__force __u16)*p;
 }
-static inline __be64 __cpu_to_be64p(const __u64 *p)
+static __always_inline __be64 __cpu_to_be64p(const __u64 *p)
 {
 	return (__force __be64)__swab64p(p);
 }
-static inline __u64 __be64_to_cpup(const __be64 *p)
+static __always_inline __u64 __be64_to_cpup(const __be64 *p)
 {
 	return __swab64p((__u64 *)p);
 }
-static inline __be32 __cpu_to_be32p(const __u32 *p)
+static __always_inline __be32 __cpu_to_be32p(const __u32 *p)
 {
 	return (__force __be32)__swab32p(p);
 }
-static inline __u32 __be32_to_cpup(const __be32 *p)
+static __always_inline __u32 __be32_to_cpup(const __be32 *p)
 {
 	return __swab32p((__u32 *)p);
 }
-static inline __be16 __cpu_to_be16p(const __u16 *p)
+static __always_inline __be16 __cpu_to_be16p(const __u16 *p)
 {
 	return (__force __be16)__swab16p(p);
 }
-static inline __u16 __be16_to_cpup(const __be16 *p)
+static __always_inline __u16 __be16_to_cpup(const __be16 *p)
 {
 	return __swab16p((__u16 *)p);
 }
diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h
index 0e011eb91b5d..3f10e5317b46 100644
--- a/include/uapi/linux/swab.h
+++ b/include/uapi/linux/swab.h
@@ -151,7 +151,7 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val)
  * __swab16p - return a byteswapped 16-bit value from a pointer
  * @p: pointer to a naturally-aligned 16-bit value
  */
-static inline __u16 __swab16p(const __u16 *p)
+static __always_inline __u16 __swab16p(const __u16 *p)
 {
 #ifdef __arch_swab16p
 	return __arch_swab16p(p);
@@ -164,7 +164,7 @@ static inline __u16 __swab16p(const __u16 *p)
  * __swab32p - return a byteswapped 32-bit value from a pointer
  * @p: pointer to a naturally-aligned 32-bit value
  */
-static inline __u32 __swab32p(const __u32 *p)
+static __always_inline __u32 __swab32p(const __u32 *p)
 {
 #ifdef __arch_swab32p
 	return __arch_swab32p(p);
@@ -177,7 +177,7 @@ static inline __u32 __swab32p(const __u32 *p)
  * __swab64p - return a byteswapped 64-bit value from a pointer
  * @p: pointer to a naturally-aligned 64-bit value
  */
-static inline __u64 __swab64p(const __u64 *p)
+static __always_inline __u64 __swab64p(const __u64 *p)
 {
 #ifdef __arch_swab64p
 	return __arch_swab64p(p);
@@ -232,7 +232,7 @@ static inline void __swab16s(__u16 *p)
  * __swab32s - byteswap a 32-bit value in-place
  * @p: pointer to a naturally-aligned 32-bit value
  */
-static inline void __swab32s(__u32 *p)
+static __always_inline void __swab32s(__u32 *p)
 {
 #ifdef __arch_swab32s
 	__arch_swab32s(p);
@@ -245,7 +245,7 @@ static inline void __swab32s(__u32 *p)
  * __swab64s - byteswap a 64-bit value in-place
  * @p: pointer to a naturally-aligned 64-bit value
  */
-static inline void __swab64s(__u64 *p)
+static __always_inline void __swab64s(__u64 *p)
 {
 #ifdef __arch_swab64s
 	__arch_swab64s(p);
-- 
cgit v1.2.3


From 0b81d0779072696371822e5ed9e7c6292e547024 Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Fri, 15 May 2015 16:26:10 -0700
Subject: fs crypto: move per-file encryption from f2fs tree to fs/crypto

This patch adds the renamed functions moved from the f2fs crypto files.

1. definitions for per-file encryption used by ext4 and f2fs.

2. crypto.c for encrypt/decrypt functions
 a. IO preparation:
  - fscrypt_get_ctx / fscrypt_release_ctx
 b. before IOs:
  - fscrypt_encrypt_page
  - fscrypt_decrypt_page
  - fscrypt_zeroout_range
 c. after IOs:
  - fscrypt_decrypt_bio_pages
  - fscrypt_pullback_bio_page
  - fscrypt_restore_control_page

3. policy.c supporting context management.
 a. For ioctls:
  - fscrypt_process_policy
  - fscrypt_get_policy
 b. For context permission
  - fscrypt_has_permitted_context
  - fscrypt_inherit_context

4. keyinfo.c to handle permissions
  - fscrypt_get_encryption_info
  - fscrypt_free_encryption_info

5. fname.c to support filename encryption
 a. general wrapper functions
  - fscrypt_fname_disk_to_usr
  - fscrypt_fname_usr_to_disk
  - fscrypt_setup_filename
  - fscrypt_free_filename

 b. specific filename handling functions
  - fscrypt_fname_alloc_buffer
  - fscrypt_fname_free_buffer

6. Makefile and Kconfig

Cc: Al Viro <viro@ftp.linux.org.uk>
Signed-off-by: Michael Halcrow <mhalcrow@google.com>
Signed-off-by: Ildar Muslukhov <ildarm@google.com>
Signed-off-by: Uday Savagaonkar <savagaon@google.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/Kconfig               |   2 +
 fs/Makefile              |   1 +
 fs/crypto/Kconfig        |  18 ++
 fs/crypto/Makefile       |   3 +
 fs/crypto/crypto.c       | 556 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/crypto/fname.c        | 427 ++++++++++++++++++++++++++++++++++++
 fs/crypto/keyinfo.c      | 278 ++++++++++++++++++++++++
 fs/crypto/policy.c       | 229 +++++++++++++++++++
 fs/f2fs/Kconfig          |  10 +-
 fs/f2fs/Makefile         |   2 -
 fs/f2fs/crypto.c         | 473 ----------------------------------------
 fs/f2fs/crypto_fname.c   | 446 -------------------------------------
 fs/f2fs/crypto_key.c     | 267 -----------------------
 fs/f2fs/crypto_policy.c  | 210 ------------------
 fs/f2fs/data.c           |  31 ++-
 fs/f2fs/dir.c            |  44 ++--
 fs/f2fs/f2fs.h           | 172 +++------------
 fs/f2fs/f2fs_crypto.h    | 151 -------------
 fs/f2fs/file.c           |  38 ++--
 fs/f2fs/inline.c         |   4 +-
 fs/f2fs/inode.c          |   5 +-
 fs/f2fs/namei.c          |  56 +++--
 fs/f2fs/super.c          |  55 +++--
 include/linux/dcache.h   |   2 +
 include/linux/fs.h       |   8 +
 include/linux/fscrypto.h | 433 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/fs.h  |  18 ++
 27 files changed, 2125 insertions(+), 1814 deletions(-)
 create mode 100644 fs/crypto/Kconfig
 create mode 100644 fs/crypto/Makefile
 create mode 100644 fs/crypto/crypto.c
 create mode 100644 fs/crypto/fname.c
 create mode 100644 fs/crypto/keyinfo.c
 create mode 100644 fs/crypto/policy.c
 delete mode 100644 fs/f2fs/crypto.c
 delete mode 100644 fs/f2fs/crypto_fname.c
 delete mode 100644 fs/f2fs/crypto_key.c
 delete mode 100644 fs/f2fs/crypto_policy.c
 delete mode 100644 fs/f2fs/f2fs_crypto.h
 create mode 100644 include/linux/fscrypto.h

(limited to 'include/uapi/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index 9adee0d7536e..9d757673bf40 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -84,6 +84,8 @@ config MANDATORY_FILE_LOCKING
 
 	  To the best of my knowledge this is dead code that no one cares about.
 
+source "fs/crypto/Kconfig"
+
 source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 79f522575cba..252c96898a43 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_USERFAULTFD)	+= userfaultfd.o
 obj-$(CONFIG_AIO)               += aio.o
 obj-$(CONFIG_FS_DAX)		+= dax.o
+obj-$(CONFIG_FS_ENCRYPTION)	+= crypto/
 obj-$(CONFIG_FILE_LOCKING)      += locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
new file mode 100644
index 000000000000..92348faf9865
--- /dev/null
+++ b/fs/crypto/Kconfig
@@ -0,0 +1,18 @@
+config FS_ENCRYPTION
+	tristate "FS Encryption (Per-file encryption)"
+	depends on BLOCK
+	select CRYPTO
+	select CRYPTO_AES
+	select CRYPTO_CBC
+	select CRYPTO_ECB
+	select CRYPTO_XTS
+	select CRYPTO_CTS
+	select CRYPTO_CTR
+	select CRYPTO_SHA256
+	select KEYS
+	select ENCRYPTED_KEYS
+	help
+	  Enable encryption of files and directories.  This
+	  feature is similar to ecryptfs, but it is more memory
+	  efficient since it avoids caching the encrypted and
+	  decrypted pages in the page cache.
diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile
new file mode 100644
index 000000000000..f17684c48739
--- /dev/null
+++ b/fs/crypto/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_FS_ENCRYPTION)	+= fscrypto.o
+
+fscrypto-y := crypto.o fname.o policy.o keyinfo.o
diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c
new file mode 100644
index 000000000000..d45c33157e2b
--- /dev/null
+++ b/fs/crypto/crypto.c
@@ -0,0 +1,556 @@
+/*
+ * This contains encryption functions for per-file encryption.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * Written by Michael Halcrow, 2014.
+ *
+ * Filename encryption additions
+ *	Uday Savagaonkar, 2014
+ * Encryption policy handling additions
+ *	Ildar Muslukhov, 2014
+ * Add fscrypt_pullback_bio_page()
+ *	Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ *
+ * The usage of AES-XTS should conform to recommendations in NIST
+ * Special Publication 800-38E and IEEE P1619/D16.
+ */
+
+#include <linux/crypto.h>
+#include <linux/ecryptfs.h>
+#include <linux/pagemap.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/ratelimit.h>
+#include <linux/bio.h>
+#include <linux/dcache.h>
+#include <linux/fscrypto.h>
+
+static unsigned int num_prealloc_crypto_pages = 32;
+static unsigned int num_prealloc_crypto_ctxs = 128;
+
+module_param(num_prealloc_crypto_pages, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_pages,
+		"Number of crypto pages to preallocate");
+module_param(num_prealloc_crypto_ctxs, uint, 0444);
+MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
+		"Number of crypto contexts to preallocate");
+
+static mempool_t *fscrypt_bounce_page_pool = NULL;
+
+static LIST_HEAD(fscrypt_free_ctxs);
+static DEFINE_SPINLOCK(fscrypt_ctx_lock);
+
+static struct workqueue_struct *fscrypt_read_workqueue;
+static DEFINE_MUTEX(fscrypt_init_mutex);
+
+static struct kmem_cache *fscrypt_ctx_cachep;
+struct kmem_cache *fscrypt_info_cachep;
+
+/**
+ * fscrypt_release_ctx() - Releases an encryption context
+ * @ctx: The encryption context to release.
+ *
+ * If the encryption context was allocated from the pre-allocated pool, returns
+ * it to that pool. Else, frees it.
+ *
+ * If there's a bounce page in the context, this frees that.
+ */
+void fscrypt_release_ctx(struct fscrypt_ctx *ctx)
+{
+	unsigned long flags;
+
+	if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) {
+		mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool);
+		ctx->w.bounce_page = NULL;
+	}
+	ctx->w.control_page = NULL;
+	if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
+		kmem_cache_free(fscrypt_ctx_cachep, ctx);
+	} else {
+		spin_lock_irqsave(&fscrypt_ctx_lock, flags);
+		list_add(&ctx->free_list, &fscrypt_free_ctxs);
+		spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
+	}
+}
+EXPORT_SYMBOL(fscrypt_release_ctx);
+
+/**
+ * fscrypt_get_ctx() - Gets an encryption context
+ * @inode:       The inode for which we are doing the crypto
+ *
+ * Allocates and initializes an encryption context.
+ *
+ * Return: An allocated and initialized encryption context on success; error
+ * value or NULL otherwise.
+ */
+struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode)
+{
+	struct fscrypt_ctx *ctx = NULL;
+	struct fscrypt_info *ci = inode->i_crypt_info;
+	unsigned long flags;
+
+	if (ci == NULL)
+		return ERR_PTR(-ENOKEY);
+
+	/*
+	 * We first try getting the ctx from a free list because in
+	 * the common case the ctx will have an allocated and
+	 * initialized crypto tfm, so it's probably a worthwhile
+	 * optimization. For the bounce page, we first try getting it
+	 * from the kernel allocator because that's just about as fast
+	 * as getting it from a list and because a cache of free pages
+	 * should generally be a "last resort" option for a filesystem
+	 * to be able to do its job.
+	 */
+	spin_lock_irqsave(&fscrypt_ctx_lock, flags);
+	ctx = list_first_entry_or_null(&fscrypt_free_ctxs,
+					struct fscrypt_ctx, free_list);
+	if (ctx)
+		list_del(&ctx->free_list);
+	spin_unlock_irqrestore(&fscrypt_ctx_lock, flags);
+	if (!ctx) {
+		ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
+		if (!ctx)
+			return ERR_PTR(-ENOMEM);
+		ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+	} else {
+		ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
+	}
+	ctx->flags &= ~FS_WRITE_PATH_FL;
+	return ctx;
+}
+EXPORT_SYMBOL(fscrypt_get_ctx);
+
+/**
+ * fscrypt_complete() - The completion callback for page encryption
+ * @req: The asynchronous encryption request context
+ * @res: The result of the encryption operation
+ */
+static void fscrypt_complete(struct crypto_async_request *req, int res)
+{
+	struct fscrypt_completion_result *ecr = req->data;
+
+	if (res == -EINPROGRESS)
+		return;
+	ecr->res = res;
+	complete(&ecr->completion);
+}
+
+typedef enum {
+	FS_DECRYPT = 0,
+	FS_ENCRYPT,
+} fscrypt_direction_t;
+
+static int do_page_crypto(struct inode *inode,
+			fscrypt_direction_t rw, pgoff_t index,
+			struct page *src_page, struct page *dest_page)
+{
+	u8 xts_tweak[FS_XTS_TWEAK_SIZE];
+	struct ablkcipher_request *req = NULL;
+	DECLARE_FS_COMPLETION_RESULT(ecr);
+	struct scatterlist dst, src;
+	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+	int res = 0;
+
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		printk_ratelimited(KERN_ERR
+				"%s: crypto_request_alloc() failed\n",
+				__func__);
+		return -ENOMEM;
+	}
+
+	ablkcipher_request_set_callback(
+		req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+		fscrypt_complete, &ecr);
+
+	BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index));
+	memcpy(xts_tweak, &inode->i_ino, sizeof(index));
+	memset(&xts_tweak[sizeof(index)], 0,
+			FS_XTS_TWEAK_SIZE - sizeof(index));
+
+	sg_init_table(&dst, 1);
+	sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
+	sg_init_table(&src, 1);
+	sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
+	ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
+					xts_tweak);
+	if (rw == FS_DECRYPT)
+		res = crypto_ablkcipher_decrypt(req);
+	else
+		res = crypto_ablkcipher_encrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		BUG_ON(req->base.data != &ecr);
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+	ablkcipher_request_free(req);
+	if (res) {
+		printk_ratelimited(KERN_ERR
+			"%s: crypto_ablkcipher_encrypt() returned %d\n",
+			__func__, res);
+		return res;
+	}
+	return 0;
+}
+
+static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx)
+{
+	ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool,
+							GFP_NOWAIT);
+	if (ctx->w.bounce_page == NULL)
+		return ERR_PTR(-ENOMEM);
+	ctx->flags |= FS_WRITE_PATH_FL;
+	return ctx->w.bounce_page;
+}
+
+/**
+ * fscypt_encrypt_page() - Encrypts a page
+ * @inode:          The inode for which the encryption should take place
+ * @plaintext_page: The page to encrypt. Must be locked.
+ *
+ * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
+ * encryption context.
+ *
+ * Called on the page write path.  The caller must call
+ * fscrypt_restore_control_page() on the returned ciphertext page to
+ * release the bounce buffer and the encryption context.
+ *
+ * Return: An allocated page with the encrypted content on success. Else, an
+ * error value or NULL.
+ */
+struct page *fscrypt_encrypt_page(struct inode *inode,
+				struct page *plaintext_page)
+{
+	struct fscrypt_ctx *ctx;
+	struct page *ciphertext_page = NULL;
+	int err;
+
+	BUG_ON(!PageLocked(plaintext_page));
+
+	ctx = fscrypt_get_ctx(inode);
+	if (IS_ERR(ctx))
+		return (struct page *)ctx;
+
+	/* The encryption operation will require a bounce page. */
+	ciphertext_page = alloc_bounce_page(ctx);
+	if (IS_ERR(ciphertext_page))
+		goto errout;
+
+	ctx->w.control_page = plaintext_page;
+	err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index,
+					plaintext_page, ciphertext_page);
+	if (err) {
+		ciphertext_page = ERR_PTR(err);
+		goto errout;
+	}
+	SetPagePrivate(ciphertext_page);
+	set_page_private(ciphertext_page, (unsigned long)ctx);
+	lock_page(ciphertext_page);
+	return ciphertext_page;
+
+errout:
+	fscrypt_release_ctx(ctx);
+	return ciphertext_page;
+}
+EXPORT_SYMBOL(fscrypt_encrypt_page);
+
+/**
+ * f2crypt_decrypt_page() - Decrypts a page in-place
+ * @page: The page to decrypt. Must be locked.
+ *
+ * Decrypts page in-place using the ctx encryption context.
+ *
+ * Called from the read completion callback.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int fscrypt_decrypt_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	return do_page_crypto(page->mapping->host,
+			FS_DECRYPT, page->index, page, page);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_page);
+
+int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk,
+				sector_t pblk, unsigned int len)
+{
+	struct fscrypt_ctx *ctx;
+	struct page *ciphertext_page = NULL;
+	struct bio *bio;
+	int ret, err = 0;
+
+	BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
+
+	ctx = fscrypt_get_ctx(inode);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ciphertext_page = alloc_bounce_page(ctx);
+	if (IS_ERR(ciphertext_page)) {
+		err = PTR_ERR(ciphertext_page);
+		goto errout;
+	}
+
+	while (len--) {
+		err = do_page_crypto(inode, FS_ENCRYPT, lblk,
+						ZERO_PAGE(0), ciphertext_page);
+		if (err)
+			goto errout;
+
+		bio = bio_alloc(GFP_KERNEL, 1);
+		if (!bio) {
+			err = -ENOMEM;
+			goto errout;
+		}
+		bio->bi_bdev = inode->i_sb->s_bdev;
+		bio->bi_iter.bi_sector =
+			pblk << (inode->i_sb->s_blocksize_bits - 9);
+		ret = bio_add_page(bio, ciphertext_page,
+					inode->i_sb->s_blocksize, 0);
+		if (ret != inode->i_sb->s_blocksize) {
+			/* should never happen! */
+			WARN_ON(1);
+			bio_put(bio);
+			err = -EIO;
+			goto errout;
+		}
+		err = submit_bio_wait(WRITE, bio);
+		if ((err == 0) && bio->bi_error)
+			err = -EIO;
+		bio_put(bio);
+		if (err)
+			goto errout;
+		lblk++;
+		pblk++;
+	}
+	err = 0;
+errout:
+	fscrypt_release_ctx(ctx);
+	return err;
+}
+EXPORT_SYMBOL(fscrypt_zeroout_range);
+
+/*
+ * Validate dentries for encrypted directories to make sure we aren't
+ * potentially caching stale data after a key has been added or
+ * removed.
+ */
+static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct fscrypt_info *ci = dir->i_crypt_info;
+	int dir_has_key, cached_with_key;
+
+	if (!dir->i_sb->s_cop->is_encrypted(dir))
+		return 0;
+
+	if (ci && ci->ci_keyring_key &&
+	    (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+					  (1 << KEY_FLAG_REVOKED) |
+					  (1 << KEY_FLAG_DEAD))))
+		ci = NULL;
+
+	/* this should eventually be an flag in d_flags */
+	spin_lock(&dentry->d_lock);
+	cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY;
+	spin_unlock(&dentry->d_lock);
+	dir_has_key = (ci != NULL);
+
+	/*
+	 * If the dentry was cached without the key, and it is a
+	 * negative dentry, it might be a valid name.  We can't check
+	 * if the key has since been made available due to locking
+	 * reasons, so we fail the validation so ext4_lookup() can do
+	 * this check.
+	 *
+	 * We also fail the validation if the dentry was created with
+	 * the key present, but we no longer have the key, or vice versa.
+	 */
+	if ((!cached_with_key && d_is_negative(dentry)) ||
+			(!cached_with_key && dir_has_key) ||
+			(cached_with_key && !dir_has_key))
+		return 0;
+	return 1;
+}
+
+const struct dentry_operations fscrypt_d_ops = {
+	.d_revalidate = fscrypt_d_revalidate,
+};
+EXPORT_SYMBOL(fscrypt_d_ops);
+
+/*
+ * Call fscrypt_decrypt_page on every single page, reusing the encryption
+ * context.
+ */
+static void completion_pages(struct work_struct *work)
+{
+	struct fscrypt_ctx *ctx =
+		container_of(work, struct fscrypt_ctx, r.work);
+	struct bio *bio = ctx->r.bio;
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_segment_all(bv, bio, i) {
+		struct page *page = bv->bv_page;
+		int ret = fscrypt_decrypt_page(page);
+
+		if (ret) {
+			WARN_ON_ONCE(1);
+			SetPageError(page);
+		} else {
+			SetPageUptodate(page);
+		}
+		unlock_page(page);
+	}
+	fscrypt_release_ctx(ctx);
+	bio_put(bio);
+}
+
+void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio)
+{
+	INIT_WORK(&ctx->r.work, completion_pages);
+	ctx->r.bio = bio;
+	queue_work(fscrypt_read_workqueue, &ctx->r.work);
+}
+EXPORT_SYMBOL(fscrypt_decrypt_bio_pages);
+
+void fscrypt_pullback_bio_page(struct page **page, bool restore)
+{
+	struct fscrypt_ctx *ctx;
+	struct page *bounce_page;
+
+	/* The bounce data pages are unmapped. */
+	if ((*page)->mapping)
+		return;
+
+	/* The bounce data page is unmapped. */
+	bounce_page = *page;
+	ctx = (struct fscrypt_ctx *)page_private(bounce_page);
+
+	/* restore control page */
+	*page = ctx->w.control_page;
+
+	if (restore)
+		fscrypt_restore_control_page(bounce_page);
+}
+EXPORT_SYMBOL(fscrypt_pullback_bio_page);
+
+void fscrypt_restore_control_page(struct page *page)
+{
+	struct fscrypt_ctx *ctx;
+
+	ctx = (struct fscrypt_ctx *)page_private(page);
+	set_page_private(page, (unsigned long)NULL);
+	ClearPagePrivate(page);
+	unlock_page(page);
+	fscrypt_release_ctx(ctx);
+}
+EXPORT_SYMBOL(fscrypt_restore_control_page);
+
+static void fscrypt_destroy(void)
+{
+	struct fscrypt_ctx *pos, *n;
+
+	list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list)
+		kmem_cache_free(fscrypt_ctx_cachep, pos);
+	INIT_LIST_HEAD(&fscrypt_free_ctxs);
+	mempool_destroy(fscrypt_bounce_page_pool);
+	fscrypt_bounce_page_pool = NULL;
+}
+
+/**
+ * fscrypt_initialize() - allocate major buffers for fs encryption.
+ *
+ * We only call this when we start accessing encrypted files, since it
+ * results in memory getting allocated that wouldn't otherwise be used.
+ *
+ * Return: Zero on success, non-zero otherwise.
+ */
+int fscrypt_initialize(void)
+{
+	int i, res = -ENOMEM;
+
+	if (fscrypt_bounce_page_pool)
+		return 0;
+
+	mutex_lock(&fscrypt_init_mutex);
+	if (fscrypt_bounce_page_pool)
+		goto already_initialized;
+
+	for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
+		struct fscrypt_ctx *ctx;
+
+		ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS);
+		if (!ctx)
+			goto fail;
+		list_add(&ctx->free_list, &fscrypt_free_ctxs);
+	}
+
+	fscrypt_bounce_page_pool =
+		mempool_create_page_pool(num_prealloc_crypto_pages, 0);
+	if (!fscrypt_bounce_page_pool)
+		goto fail;
+
+already_initialized:
+	mutex_unlock(&fscrypt_init_mutex);
+	return 0;
+fail:
+	fscrypt_destroy();
+	mutex_unlock(&fscrypt_init_mutex);
+	return res;
+}
+EXPORT_SYMBOL(fscrypt_initialize);
+
+/**
+ * fscrypt_init() - Set up for fs encryption.
+ */
+static int __init fscrypt_init(void)
+{
+	fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue",
+							WQ_HIGHPRI, 0);
+	if (!fscrypt_read_workqueue)
+		goto fail;
+
+	fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT);
+	if (!fscrypt_ctx_cachep)
+		goto fail_free_queue;
+
+	fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT);
+	if (!fscrypt_info_cachep)
+		goto fail_free_ctx;
+
+	return 0;
+
+fail_free_ctx:
+	kmem_cache_destroy(fscrypt_ctx_cachep);
+fail_free_queue:
+	destroy_workqueue(fscrypt_read_workqueue);
+fail:
+	return -ENOMEM;
+}
+module_init(fscrypt_init)
+
+/**
+ * fscrypt_exit() - Shutdown the fs encryption system
+ */
+static void __exit fscrypt_exit(void)
+{
+	fscrypt_destroy();
+
+	if (fscrypt_read_workqueue)
+		destroy_workqueue(fscrypt_read_workqueue);
+	kmem_cache_destroy(fscrypt_ctx_cachep);
+	kmem_cache_destroy(fscrypt_info_cachep);
+}
+module_exit(fscrypt_exit);
+
+MODULE_LICENSE("GPL");
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
new file mode 100644
index 000000000000..5e4ddeeba267
--- /dev/null
+++ b/fs/crypto/fname.c
@@ -0,0 +1,427 @@
+/*
+ * This contains functions for filename crypto management
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility
+ *
+ * Written by Uday Savagaonkar, 2014.
+ * Modified by Jaegeuk Kim, 2015.
+ *
+ * This has not yet undergone a rigorous security audit.
+ */
+
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <linux/ratelimit.h>
+#include <linux/fscrypto.h>
+
+static u32 size_round_up(size_t size, size_t blksize)
+{
+	return ((size + blksize - 1) / blksize) * blksize;
+}
+
+/**
+ * dir_crypt_complete() -
+ */
+static void dir_crypt_complete(struct crypto_async_request *req, int res)
+{
+	struct fscrypt_completion_result *ecr = req->data;
+
+	if (res == -EINPROGRESS)
+		return;
+	ecr->res = res;
+	complete(&ecr->completion);
+}
+
+/**
+ * fname_encrypt() -
+ *
+ * This function encrypts the input filename, and returns the length of the
+ * ciphertext. Errors are returned as negative numbers.  We trust the caller to
+ * allocate sufficient memory to oname string.
+ */
+static int fname_encrypt(struct inode *inode,
+			const struct qstr *iname, struct fscrypt_str *oname)
+{
+	u32 ciphertext_len;
+	struct ablkcipher_request *req = NULL;
+	DECLARE_FS_COMPLETION_RESULT(ecr);
+	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+	int res = 0;
+	char iv[FS_CRYPTO_BLOCK_SIZE];
+	struct scatterlist src_sg, dst_sg;
+	int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
+	char *workbuf, buf[32], *alloc_buf = NULL;
+	unsigned lim;
+
+	lim = inode->i_sb->s_cop->max_namelen(inode);
+	if (iname->len <= 0 || iname->len > lim)
+		return -EIO;
+
+	ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ?
+					FS_CRYPTO_BLOCK_SIZE : iname->len;
+	ciphertext_len = size_round_up(ciphertext_len, padding);
+	ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len;
+
+	if (ciphertext_len <= sizeof(buf)) {
+		workbuf = buf;
+	} else {
+		alloc_buf = kmalloc(ciphertext_len, GFP_NOFS);
+		if (!alloc_buf)
+			return -ENOMEM;
+		workbuf = alloc_buf;
+	}
+
+	/* Allocate request */
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		printk_ratelimited(KERN_ERR
+			"%s: crypto_request_alloc() failed\n", __func__);
+		kfree(alloc_buf);
+		return -ENOMEM;
+	}
+	ablkcipher_request_set_callback(req,
+			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+			dir_crypt_complete, &ecr);
+
+	/* Copy the input */
+	memcpy(workbuf, iname->name, iname->len);
+	if (iname->len < ciphertext_len)
+		memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
+
+	/* Initialize IV */
+	memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+
+	/* Create encryption request */
+	sg_init_one(&src_sg, workbuf, ciphertext_len);
+	sg_init_one(&dst_sg, oname->name, ciphertext_len);
+	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
+	res = crypto_ablkcipher_encrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+	kfree(alloc_buf);
+	ablkcipher_request_free(req);
+	if (res < 0)
+		printk_ratelimited(KERN_ERR
+				"%s: Error (error code %d)\n", __func__, res);
+
+	oname->len = ciphertext_len;
+	return res;
+}
+
+/*
+ * fname_decrypt()
+ *	This function decrypts the input filename, and returns
+ *	the length of the plaintext.
+ *	Errors are returned as negative numbers.
+ *	We trust the caller to allocate sufficient memory to oname string.
+ */
+static int fname_decrypt(struct inode *inode,
+				const struct fscrypt_str *iname,
+				struct fscrypt_str *oname)
+{
+	struct ablkcipher_request *req = NULL;
+	DECLARE_FS_COMPLETION_RESULT(ecr);
+	struct scatterlist src_sg, dst_sg;
+	struct fscrypt_info *ci = inode->i_crypt_info;
+	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
+	int res = 0;
+	char iv[FS_CRYPTO_BLOCK_SIZE];
+	unsigned lim;
+
+	lim = inode->i_sb->s_cop->max_namelen(inode);
+	if (iname->len <= 0 || iname->len > lim)
+		return -EIO;
+
+	/* Allocate request */
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		printk_ratelimited(KERN_ERR
+			"%s: crypto_request_alloc() failed\n",  __func__);
+		return -ENOMEM;
+	}
+	ablkcipher_request_set_callback(req,
+		CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+		dir_crypt_complete, &ecr);
+
+	/* Initialize IV */
+	memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
+
+	/* Create decryption request */
+	sg_init_one(&src_sg, iname->name, iname->len);
+	sg_init_one(&dst_sg, oname->name, oname->len);
+	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
+	res = crypto_ablkcipher_decrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+	ablkcipher_request_free(req);
+	if (res < 0) {
+		printk_ratelimited(KERN_ERR
+				"%s: Error (error code %d)\n", __func__, res);
+		return res;
+	}
+
+	oname->len = strnlen(oname->name, iname->len);
+	return oname->len;
+}
+
+static const char *lookup_table =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
+
+/**
+ * digest_encode() -
+ *
+ * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
+ * The encoded string is roughly 4/3 times the size of the input string.
+ */
+static int digest_encode(const char *src, int len, char *dst)
+{
+	int i = 0, bits = 0, ac = 0;
+	char *cp = dst;
+
+	while (i < len) {
+		ac += (((unsigned char) src[i]) << bits);
+		bits += 8;
+		do {
+			*cp++ = lookup_table[ac & 0x3f];
+			ac >>= 6;
+			bits -= 6;
+		} while (bits >= 6);
+		i++;
+	}
+	if (bits)
+		*cp++ = lookup_table[ac & 0x3f];
+	return cp - dst;
+}
+
+static int digest_decode(const char *src, int len, char *dst)
+{
+	int i = 0, bits = 0, ac = 0;
+	const char *p;
+	char *cp = dst;
+
+	while (i < len) {
+		p = strchr(lookup_table, src[i]);
+		if (p == NULL || src[i] == 0)
+			return -2;
+		ac += (p - lookup_table) << bits;
+		bits += 6;
+		if (bits >= 8) {
+			*cp++ = ac & 0xff;
+			ac >>= 8;
+			bits -= 8;
+		}
+		i++;
+	}
+	if (ac)
+		return -1;
+	return cp - dst;
+}
+
+u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen)
+{
+	int padding = 32;
+	struct fscrypt_info *ci = inode->i_crypt_info;
+
+	if (ci)
+		padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK);
+	if (ilen < FS_CRYPTO_BLOCK_SIZE)
+		ilen = FS_CRYPTO_BLOCK_SIZE;
+	return size_round_up(ilen, padding);
+}
+EXPORT_SYMBOL(fscrypt_fname_encrypted_size);
+
+/**
+ * fscrypt_fname_crypto_alloc_obuff() -
+ *
+ * Allocates an output buffer that is sufficient for the crypto operation
+ * specified by the context and the direction.
+ */
+int fscrypt_fname_alloc_buffer(struct inode *inode,
+				u32 ilen, struct fscrypt_str *crypto_str)
+{
+	unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen);
+
+	crypto_str->len = olen;
+	if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
+		olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
+	/*
+	 * Allocated buffer can hold one more character to null-terminate the
+	 * string
+	 */
+	crypto_str->name = kmalloc(olen + 1, GFP_NOFS);
+	if (!(crypto_str->name))
+		return -ENOMEM;
+	return 0;
+}
+EXPORT_SYMBOL(fscrypt_fname_alloc_buffer);
+
+/**
+ * fscrypt_fname_crypto_free_buffer() -
+ *
+ * Frees the buffer allocated for crypto operation.
+ */
+void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
+{
+	if (!crypto_str)
+		return;
+	kfree(crypto_str->name);
+	crypto_str->name = NULL;
+}
+EXPORT_SYMBOL(fscrypt_fname_free_buffer);
+
+/**
+ * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user
+ * space
+ */
+int fscrypt_fname_disk_to_usr(struct inode *inode,
+			u32 hash, u32 minor_hash,
+			const struct fscrypt_str *iname,
+			struct fscrypt_str *oname)
+{
+	const struct qstr qname = FSTR_TO_QSTR(iname);
+	char buf[24];
+	int ret;
+
+	if (fscrypt_is_dot_dotdot(&qname)) {
+		oname->name[0] = '.';
+		oname->name[iname->len - 1] = '.';
+		oname->len = iname->len;
+		return oname->len;
+	}
+
+	if (iname->len < FS_CRYPTO_BLOCK_SIZE)
+		return -EUCLEAN;
+
+	if (inode->i_crypt_info)
+		return fname_decrypt(inode, iname, oname);
+
+	if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) {
+		ret = digest_encode(iname->name, iname->len, oname->name);
+		oname->len = ret;
+		return ret;
+	}
+	if (hash) {
+		memcpy(buf, &hash, 4);
+		memcpy(buf + 4, &minor_hash, 4);
+	} else {
+		memset(buf, 0, 8);
+	}
+	memcpy(buf + 8, iname->name + iname->len - 16, 16);
+	oname->name[0] = '_';
+	ret = digest_encode(buf, 24, oname->name + 1);
+	oname->len = ret + 1;
+	return ret + 1;
+}
+EXPORT_SYMBOL(fscrypt_fname_disk_to_usr);
+
+/**
+ * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk
+ * space
+ */
+int fscrypt_fname_usr_to_disk(struct inode *inode,
+			const struct qstr *iname,
+			struct fscrypt_str *oname)
+{
+	if (fscrypt_is_dot_dotdot(iname)) {
+		oname->name[0] = '.';
+		oname->name[iname->len - 1] = '.';
+		oname->len = iname->len;
+		return oname->len;
+	}
+	if (inode->i_crypt_info)
+		return fname_encrypt(inode, iname, oname);
+	/*
+	 * Without a proper key, a user is not allowed to modify the filenames
+	 * in a directory. Consequently, a user space name cannot be mapped to
+	 * a disk-space name
+	 */
+	return -EACCES;
+}
+EXPORT_SYMBOL(fscrypt_fname_usr_to_disk);
+
+int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
+			      int lookup, struct fscrypt_name *fname)
+{
+	int ret = 0, bigname = 0;
+
+	memset(fname, 0, sizeof(struct fscrypt_name));
+	fname->usr_fname = iname;
+
+	if (!dir->i_sb->s_cop->is_encrypted(dir) ||
+				fscrypt_is_dot_dotdot(iname)) {
+		fname->disk_name.name = (unsigned char *)iname->name;
+		fname->disk_name.len = iname->len;
+		return 0;
+	}
+	ret = get_crypt_info(dir);
+	if (ret && ret != -EOPNOTSUPP)
+		return ret;
+
+	if (dir->i_crypt_info) {
+		ret = fscrypt_fname_alloc_buffer(dir, iname->len,
+							&fname->crypto_buf);
+		if (ret < 0)
+			return ret;
+		ret = fname_encrypt(dir, iname, &fname->crypto_buf);
+		if (ret < 0)
+			goto errout;
+		fname->disk_name.name = fname->crypto_buf.name;
+		fname->disk_name.len = fname->crypto_buf.len;
+		return 0;
+	}
+	if (!lookup)
+		return -EACCES;
+
+	/*
+	 * We don't have the key and we are doing a lookup; decode the
+	 * user-supplied name
+	 */
+	if (iname->name[0] == '_')
+		bigname = 1;
+	if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43)))
+		return -ENOENT;
+
+	fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
+	if (fname->crypto_buf.name == NULL)
+		return -ENOMEM;
+
+	ret = digest_decode(iname->name + bigname, iname->len - bigname,
+				fname->crypto_buf.name);
+	if (ret < 0) {
+		ret = -ENOENT;
+		goto errout;
+	}
+	fname->crypto_buf.len = ret;
+	if (bigname) {
+		memcpy(&fname->hash, fname->crypto_buf.name, 4);
+		memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4);
+	} else {
+		fname->disk_name.name = fname->crypto_buf.name;
+		fname->disk_name.len = fname->crypto_buf.len;
+	}
+	return 0;
+
+errout:
+	fscrypt_fname_free_buffer(&fname->crypto_buf);
+	return ret;
+}
+EXPORT_SYMBOL(fscrypt_setup_filename);
+
+void fscrypt_free_filename(struct fscrypt_name *fname)
+{
+	kfree(fname->crypto_buf.name);
+	fname->crypto_buf.name = NULL;
+	fname->usr_fname = NULL;
+	fname->disk_name.name = NULL;
+}
+EXPORT_SYMBOL(fscrypt_free_filename);
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c
new file mode 100644
index 000000000000..cb618425b73c
--- /dev/null
+++ b/fs/crypto/keyinfo.c
@@ -0,0 +1,278 @@
+/*
+ * key management facility for FS encryption support.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * This contains encryption key functions.
+ *
+ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
+ */
+
+#include <keys/encrypted-type.h>
+#include <keys/user-type.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <uapi/linux/keyctl.h>
+#include <crypto/hash.h>
+#include <linux/fscrypto.h>
+
+static void derive_crypt_complete(struct crypto_async_request *req, int rc)
+{
+	struct fscrypt_completion_result *ecr = req->data;
+
+	if (rc == -EINPROGRESS)
+		return;
+
+	ecr->res = rc;
+	complete(&ecr->completion);
+}
+
+/**
+ * derive_key_aes() - Derive a key using AES-128-ECB
+ * @deriving_key: Encryption key used for derivation.
+ * @source_key:   Source key to which to apply derivation.
+ * @derived_key:  Derived key.
+ *
+ * Return: Zero on success; non-zero otherwise.
+ */
+static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
+				u8 source_key[FS_AES_256_XTS_KEY_SIZE],
+				u8 derived_key[FS_AES_256_XTS_KEY_SIZE])
+{
+	int res = 0;
+	struct ablkcipher_request *req = NULL;
+	DECLARE_FS_COMPLETION_RESULT(ecr);
+	struct scatterlist src_sg, dst_sg;
+	struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
+								0);
+
+	if (IS_ERR(tfm)) {
+		res = PTR_ERR(tfm);
+		tfm = NULL;
+		goto out;
+	}
+	crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
+	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
+	if (!req) {
+		res = -ENOMEM;
+		goto out;
+	}
+	ablkcipher_request_set_callback(req,
+			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
+			derive_crypt_complete, &ecr);
+	res = crypto_ablkcipher_setkey(tfm, deriving_key,
+					FS_AES_128_ECB_KEY_SIZE);
+	if (res < 0)
+		goto out;
+
+	sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE);
+	sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE);
+	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
+					FS_AES_256_XTS_KEY_SIZE, NULL);
+	res = crypto_ablkcipher_encrypt(req);
+	if (res == -EINPROGRESS || res == -EBUSY) {
+		wait_for_completion(&ecr.completion);
+		res = ecr.res;
+	}
+out:
+	if (req)
+		ablkcipher_request_free(req);
+	if (tfm)
+		crypto_free_ablkcipher(tfm);
+	return res;
+}
+
+static void put_crypt_info(struct fscrypt_info *ci)
+{
+	if (!ci)
+		return;
+
+	if (ci->ci_keyring_key)
+		key_put(ci->ci_keyring_key);
+	crypto_free_ablkcipher(ci->ci_ctfm);
+	kmem_cache_free(fscrypt_info_cachep, ci);
+}
+
+int get_crypt_info(struct inode *inode)
+{
+	struct fscrypt_info *crypt_info;
+	u8 full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
+				(FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
+	struct key *keyring_key = NULL;
+	struct fscrypt_key *master_key;
+	struct fscrypt_context ctx;
+	const struct user_key_payload *ukp;
+	struct crypto_ablkcipher *ctfm;
+	const char *cipher_str;
+	u8 raw_key[FS_MAX_KEY_SIZE];
+	u8 mode;
+	int res;
+
+	res = fscrypt_initialize();
+	if (res)
+		return res;
+
+	if (!inode->i_sb->s_cop->get_context)
+		return -EOPNOTSUPP;
+retry:
+	crypt_info = ACCESS_ONCE(inode->i_crypt_info);
+	if (crypt_info) {
+		if (!crypt_info->ci_keyring_key ||
+				key_validate(crypt_info->ci_keyring_key) == 0)
+			return 0;
+		fscrypt_put_encryption_info(inode, crypt_info);
+		goto retry;
+	}
+
+	res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+	if (res < 0) {
+		if (!fscrypt_dummy_context_enabled(inode))
+			return res;
+		ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
+		ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
+		ctx.flags = 0;
+	} else if (res != sizeof(ctx)) {
+		return -EINVAL;
+	}
+	res = 0;
+
+	crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS);
+	if (!crypt_info)
+		return -ENOMEM;
+
+	crypt_info->ci_flags = ctx.flags;
+	crypt_info->ci_data_mode = ctx.contents_encryption_mode;
+	crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
+	crypt_info->ci_ctfm = NULL;
+	crypt_info->ci_keyring_key = NULL;
+	memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
+				sizeof(crypt_info->ci_master_key));
+	if (S_ISREG(inode->i_mode))
+		mode = crypt_info->ci_data_mode;
+	else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		mode = crypt_info->ci_filename_mode;
+	else
+		BUG();
+
+	switch (mode) {
+	case FS_ENCRYPTION_MODE_AES_256_XTS:
+		cipher_str = "xts(aes)";
+		break;
+	case FS_ENCRYPTION_MODE_AES_256_CTS:
+		cipher_str = "cts(cbc(aes))";
+		break;
+	default:
+		printk_once(KERN_WARNING
+			    "%s: unsupported key mode %d (ino %u)\n",
+			    __func__, mode, (unsigned) inode->i_ino);
+		res = -ENOKEY;
+		goto out;
+	}
+	if (fscrypt_dummy_context_enabled(inode)) {
+		memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE);
+		goto got_key;
+	}
+	memcpy(full_key_descriptor, FS_KEY_DESC_PREFIX,
+					FS_KEY_DESC_PREFIX_SIZE);
+	sprintf(full_key_descriptor + FS_KEY_DESC_PREFIX_SIZE,
+					"%*phN", FS_KEY_DESCRIPTOR_SIZE,
+					ctx.master_key_descriptor);
+	full_key_descriptor[FS_KEY_DESC_PREFIX_SIZE +
+					(2 * FS_KEY_DESCRIPTOR_SIZE)] = '\0';
+	keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
+	if (IS_ERR(keyring_key)) {
+		res = PTR_ERR(keyring_key);
+		keyring_key = NULL;
+		goto out;
+	}
+	crypt_info->ci_keyring_key = keyring_key;
+	if (keyring_key->type != &key_type_logon) {
+		printk_once(KERN_WARNING
+				"%s: key type must be logon\n", __func__);
+		res = -ENOKEY;
+		goto out;
+	}
+	down_read(&keyring_key->sem);
+	ukp = user_key_payload(keyring_key);
+	if (ukp->datalen != sizeof(struct fscrypt_key)) {
+		res = -EINVAL;
+		up_read(&keyring_key->sem);
+		goto out;
+	}
+	master_key = (struct fscrypt_key *)ukp->data;
+	BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE);
+
+	if (master_key->size != FS_AES_256_XTS_KEY_SIZE) {
+		printk_once(KERN_WARNING
+				"%s: key size incorrect: %d\n",
+				__func__, master_key->size);
+		res = -ENOKEY;
+		up_read(&keyring_key->sem);
+		goto out;
+	}
+	res = derive_key_aes(ctx.nonce, master_key->raw, raw_key);
+	up_read(&keyring_key->sem);
+	if (res)
+		goto out;
+got_key:
+	ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
+	if (!ctfm || IS_ERR(ctfm)) {
+		res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
+		printk(KERN_DEBUG
+		       "%s: error %d (inode %u) allocating crypto tfm\n",
+		       __func__, res, (unsigned) inode->i_ino);
+		goto out;
+	}
+	crypt_info->ci_ctfm = ctfm;
+	crypto_ablkcipher_clear_flags(ctfm, ~0);
+	crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
+					CRYPTO_TFM_REQ_WEAK_KEY);
+	res = crypto_ablkcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode));
+	if (res)
+		goto out;
+
+	memzero_explicit(raw_key, sizeof(raw_key));
+	if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) {
+		put_crypt_info(crypt_info);
+		goto retry;
+	}
+	return 0;
+
+out:
+	if (res == -ENOKEY)
+		res = 0;
+	put_crypt_info(crypt_info);
+	memzero_explicit(raw_key, sizeof(raw_key));
+	return res;
+}
+
+void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
+{
+	struct fscrypt_info *prev;
+
+	if (ci == NULL)
+		ci = ACCESS_ONCE(inode->i_crypt_info);
+	if (ci == NULL)
+		return;
+
+	prev = cmpxchg(&inode->i_crypt_info, ci, NULL);
+	if (prev != ci)
+		return;
+
+	put_crypt_info(ci);
+}
+EXPORT_SYMBOL(fscrypt_put_encryption_info);
+
+int fscrypt_get_encryption_info(struct inode *inode)
+{
+	struct fscrypt_info *ci = inode->i_crypt_info;
+
+	if (!ci ||
+		(ci->ci_keyring_key &&
+		 (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+					       (1 << KEY_FLAG_REVOKED) |
+					       (1 << KEY_FLAG_DEAD)))))
+		return get_crypt_info(inode);
+	return 0;
+}
+EXPORT_SYMBOL(fscrypt_get_encryption_info);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
new file mode 100644
index 000000000000..0f9961eede1e
--- /dev/null
+++ b/fs/crypto/policy.c
@@ -0,0 +1,229 @@
+/*
+ * Encryption policy functions for per-file encryption support.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ * Copyright (C) 2015, Motorola Mobility.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/fscrypto.h>
+
+static int inode_has_encryption_context(struct inode *inode)
+{
+	if (!inode->i_sb->s_cop->get_context)
+		return 0;
+	return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0);
+}
+
+/*
+ * check whether the policy is consistent with the encryption context
+ * for the inode
+ */
+static int is_encryption_context_consistent_with_policy(struct inode *inode,
+				const struct fscrypt_policy *policy)
+{
+	struct fscrypt_context ctx;
+	int res;
+
+	if (!inode->i_sb->s_cop->get_context)
+		return 0;
+
+	res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+	if (res != sizeof(ctx))
+		return 0;
+
+	return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
+			FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+			(ctx.flags == policy->flags) &&
+			(ctx.contents_encryption_mode ==
+			 policy->contents_encryption_mode) &&
+			(ctx.filenames_encryption_mode ==
+			 policy->filenames_encryption_mode));
+}
+
+static int create_encryption_context_from_policy(struct inode *inode,
+				const struct fscrypt_policy *policy)
+{
+	struct fscrypt_context ctx;
+	int res;
+
+	if (!inode->i_sb->s_cop->set_context)
+		return -EOPNOTSUPP;
+
+	if (inode->i_sb->s_cop->prepare_context) {
+		res = inode->i_sb->s_cop->prepare_context(inode);
+		if (res)
+			return res;
+	}
+
+	ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+	memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
+					FS_KEY_DESCRIPTOR_SIZE);
+
+	if (!fscrypt_valid_contents_enc_mode(
+				policy->contents_encryption_mode)) {
+		printk(KERN_WARNING
+		       "%s: Invalid contents encryption mode %d\n", __func__,
+			policy->contents_encryption_mode);
+		return -EINVAL;
+	}
+
+	if (!fscrypt_valid_filenames_enc_mode(
+				policy->filenames_encryption_mode)) {
+		printk(KERN_WARNING
+			"%s: Invalid filenames encryption mode %d\n", __func__,
+			policy->filenames_encryption_mode);
+		return -EINVAL;
+	}
+
+	if (policy->flags & ~FS_POLICY_FLAGS_VALID)
+		return -EINVAL;
+
+	ctx.contents_encryption_mode = policy->contents_encryption_mode;
+	ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
+	ctx.flags = policy->flags;
+	BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE);
+	get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+
+	return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL);
+}
+
+int fscrypt_process_policy(struct inode *inode,
+				const struct fscrypt_policy *policy)
+{
+	if (policy->version != 0)
+		return -EINVAL;
+
+	if (!inode_has_encryption_context(inode)) {
+		if (!inode->i_sb->s_cop->empty_dir)
+			return -EOPNOTSUPP;
+		if (!inode->i_sb->s_cop->empty_dir(inode))
+			return -ENOTEMPTY;
+		return create_encryption_context_from_policy(inode, policy);
+	}
+
+	if (is_encryption_context_consistent_with_policy(inode, policy))
+		return 0;
+
+	printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
+	       __func__);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(fscrypt_process_policy);
+
+int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy)
+{
+	struct fscrypt_context ctx;
+	int res;
+
+	if (!inode->i_sb->s_cop->get_context ||
+			!inode->i_sb->s_cop->is_encrypted(inode))
+		return -ENODATA;
+
+	res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
+	if (res != sizeof(ctx))
+		return -ENODATA;
+	if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1)
+		return -EINVAL;
+
+	policy->version = 0;
+	policy->contents_encryption_mode = ctx.contents_encryption_mode;
+	policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
+	policy->flags = ctx.flags;
+	memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
+				FS_KEY_DESCRIPTOR_SIZE);
+	return 0;
+}
+EXPORT_SYMBOL(fscrypt_get_policy);
+
+int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
+{
+	struct fscrypt_info *parent_ci, *child_ci;
+	int res;
+
+	if ((parent == NULL) || (child == NULL)) {
+		printk(KERN_ERR	"parent %p child %p\n", parent, child);
+		BUG_ON(1);
+	}
+
+	/* no restrictions if the parent directory is not encrypted */
+	if (!parent->i_sb->s_cop->is_encrypted(parent))
+		return 1;
+	/* if the child directory is not encrypted, this is always a problem */
+	if (!parent->i_sb->s_cop->is_encrypted(child))
+		return 0;
+	res = fscrypt_get_encryption_info(parent);
+	if (res)
+		return 0;
+	res = fscrypt_get_encryption_info(child);
+	if (res)
+		return 0;
+	parent_ci = parent->i_crypt_info;
+	child_ci = child->i_crypt_info;
+	if (!parent_ci && !child_ci)
+		return 1;
+	if (!parent_ci || !child_ci)
+		return 0;
+
+	return (memcmp(parent_ci->ci_master_key,
+			child_ci->ci_master_key,
+			FS_KEY_DESCRIPTOR_SIZE) == 0 &&
+		(parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
+		(parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
+		(parent_ci->ci_flags == child_ci->ci_flags));
+}
+EXPORT_SYMBOL(fscrypt_has_permitted_context);
+
+/**
+ * fscrypt_inherit_context() - Sets a child context from its parent
+ * @parent: Parent inode from which the context is inherited.
+ * @child:  Child inode that inherits the context from @parent.
+ * @fs_data:  private data given by FS.
+ * @preload:  preload child i_crypt_info
+ *
+ * Return: Zero on success, non-zero otherwise
+ */
+int fscrypt_inherit_context(struct inode *parent, struct inode *child,
+						void *fs_data, bool preload)
+{
+	struct fscrypt_context ctx;
+	struct fscrypt_info *ci;
+	int res;
+
+	if (!parent->i_sb->s_cop->set_context)
+		return -EOPNOTSUPP;
+
+	res = fscrypt_get_encryption_info(parent);
+	if (res < 0)
+		return res;
+
+	ci = parent->i_crypt_info;
+	if (ci == NULL)
+		return -ENOKEY;
+
+	ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1;
+	if (fscrypt_dummy_context_enabled(parent)) {
+		ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS;
+		ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS;
+		ctx.flags = 0;
+		memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE);
+		res = 0;
+	} else {
+		ctx.contents_encryption_mode = ci->ci_data_mode;
+		ctx.filenames_encryption_mode = ci->ci_filename_mode;
+		ctx.flags = ci->ci_flags;
+		memcpy(ctx.master_key_descriptor, ci->ci_master_key,
+				FS_KEY_DESCRIPTOR_SIZE);
+	}
+	get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE);
+	res = parent->i_sb->s_cop->set_context(child, &ctx,
+						sizeof(ctx), fs_data);
+	if (res)
+		return res;
+	return preload ? fscrypt_get_encryption_info(child): 0;
+}
+EXPORT_SYMBOL(fscrypt_inherit_context);
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index b0a9dc929f88..402792bae503 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -76,15 +76,7 @@ config F2FS_FS_ENCRYPTION
 	bool "F2FS Encryption"
 	depends on F2FS_FS
 	depends on F2FS_FS_XATTR
-	select CRYPTO_AES
-	select CRYPTO_CBC
-	select CRYPTO_ECB
-	select CRYPTO_XTS
-	select CRYPTO_CTS
-	select CRYPTO_CTR
-	select CRYPTO_SHA256
-	select KEYS
-	select ENCRYPTED_KEYS
+	select FS_ENCRYPTION
 	help
 	  Enable encryption of f2fs files and directories.  This
 	  feature is similar to ecryptfs, but it is more memory
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 08e101ed914c..ca949ea7c02f 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -7,5 +7,3 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
 f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
 f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
 f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
-f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \
-		crypto_key.o crypto_fname.o
diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c
deleted file mode 100644
index 3ef37868d0c7..000000000000
--- a/fs/f2fs/crypto.c
+++ /dev/null
@@ -1,473 +0,0 @@
-/*
- * linux/fs/f2fs/crypto.c
- *
- * Copied from linux/fs/ext4/crypto.c
- *
- * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2015, Motorola Mobility
- *
- * This contains encryption functions for f2fs
- *
- * Written by Michael Halcrow, 2014.
- *
- * Filename encryption additions
- *	Uday Savagaonkar, 2014
- * Encryption policy handling additions
- *	Ildar Muslukhov, 2014
- * Remove ext4_encrypted_zeroout(),
- *   add f2fs_restore_and_release_control_page()
- *	Jaegeuk Kim, 2015.
- *
- * This has not yet undergone a rigorous security audit.
- *
- * The usage of AES-XTS should conform to recommendations in NIST
- * Special Publication 800-38E and IEEE P1619/D16.
- */
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-#include <keys/user-type.h>
-#include <keys/encrypted-type.h>
-#include <linux/crypto.h>
-#include <linux/ecryptfs.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/key.h>
-#include <linux/list.h>
-#include <linux/mempool.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <linux/spinlock_types.h>
-#include <linux/f2fs_fs.h>
-#include <linux/ratelimit.h>
-#include <linux/bio.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-/* Encryption added and removed here! (L: */
-
-static unsigned int num_prealloc_crypto_pages = 32;
-static unsigned int num_prealloc_crypto_ctxs = 128;
-
-module_param(num_prealloc_crypto_pages, uint, 0444);
-MODULE_PARM_DESC(num_prealloc_crypto_pages,
-		"Number of crypto pages to preallocate");
-module_param(num_prealloc_crypto_ctxs, uint, 0444);
-MODULE_PARM_DESC(num_prealloc_crypto_ctxs,
-		"Number of crypto contexts to preallocate");
-
-static mempool_t *f2fs_bounce_page_pool;
-
-static LIST_HEAD(f2fs_free_crypto_ctxs);
-static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock);
-
-static struct workqueue_struct *f2fs_read_workqueue;
-static DEFINE_MUTEX(crypto_init);
-
-static struct kmem_cache *f2fs_crypto_ctx_cachep;
-struct kmem_cache *f2fs_crypt_info_cachep;
-
-/**
- * f2fs_release_crypto_ctx() - Releases an encryption context
- * @ctx: The encryption context to release.
- *
- * If the encryption context was allocated from the pre-allocated pool, returns
- * it to that pool. Else, frees it.
- *
- * If there's a bounce page in the context, this frees that.
- */
-void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx)
-{
-	unsigned long flags;
-
-	if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) {
-		mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool);
-		ctx->w.bounce_page = NULL;
-	}
-	ctx->w.control_page = NULL;
-	if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) {
-		kmem_cache_free(f2fs_crypto_ctx_cachep, ctx);
-	} else {
-		spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
-		list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
-		spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
-	}
-}
-
-/**
- * f2fs_get_crypto_ctx() - Gets an encryption context
- * @inode:       The inode for which we are doing the crypto
- *
- * Allocates and initializes an encryption context.
- *
- * Return: An allocated and initialized encryption context on success; error
- * value or NULL otherwise.
- */
-struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode)
-{
-	struct f2fs_crypto_ctx *ctx = NULL;
-	unsigned long flags;
-	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-
-	if (ci == NULL)
-		return ERR_PTR(-ENOKEY);
-
-	/*
-	 * We first try getting the ctx from a free list because in
-	 * the common case the ctx will have an allocated and
-	 * initialized crypto tfm, so it's probably a worthwhile
-	 * optimization. For the bounce page, we first try getting it
-	 * from the kernel allocator because that's just about as fast
-	 * as getting it from a list and because a cache of free pages
-	 * should generally be a "last resort" option for a filesystem
-	 * to be able to do its job.
-	 */
-	spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags);
-	ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs,
-					struct f2fs_crypto_ctx, free_list);
-	if (ctx)
-		list_del(&ctx->free_list);
-	spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags);
-	if (!ctx) {
-		ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS);
-		if (!ctx)
-			return ERR_PTR(-ENOMEM);
-		ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
-	} else {
-		ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL;
-	}
-	ctx->flags &= ~F2FS_WRITE_PATH_FL;
-	return ctx;
-}
-
-/*
- * Call f2fs_decrypt on every single page, reusing the encryption
- * context.
- */
-static void completion_pages(struct work_struct *work)
-{
-	struct f2fs_crypto_ctx *ctx =
-		container_of(work, struct f2fs_crypto_ctx, r.work);
-	struct bio *bio = ctx->r.bio;
-	struct bio_vec *bv;
-	int i;
-
-	bio_for_each_segment_all(bv, bio, i) {
-		struct page *page = bv->bv_page;
-		int ret = f2fs_decrypt(page);
-
-		if (ret) {
-			WARN_ON_ONCE(1);
-			SetPageError(page);
-		} else
-			SetPageUptodate(page);
-		unlock_page(page);
-	}
-	f2fs_release_crypto_ctx(ctx);
-	bio_put(bio);
-}
-
-void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio)
-{
-	INIT_WORK(&ctx->r.work, completion_pages);
-	ctx->r.bio = bio;
-	queue_work(f2fs_read_workqueue, &ctx->r.work);
-}
-
-static void f2fs_crypto_destroy(void)
-{
-	struct f2fs_crypto_ctx *pos, *n;
-
-	list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list)
-		kmem_cache_free(f2fs_crypto_ctx_cachep, pos);
-	INIT_LIST_HEAD(&f2fs_free_crypto_ctxs);
-	if (f2fs_bounce_page_pool)
-		mempool_destroy(f2fs_bounce_page_pool);
-	f2fs_bounce_page_pool = NULL;
-}
-
-/**
- * f2fs_crypto_initialize() - Set up for f2fs encryption.
- *
- * We only call this when we start accessing encrypted files, since it
- * results in memory getting allocated that wouldn't otherwise be used.
- *
- * Return: Zero on success, non-zero otherwise.
- */
-int f2fs_crypto_initialize(void)
-{
-	int i, res = -ENOMEM;
-
-	if (f2fs_bounce_page_pool)
-		return 0;
-
-	mutex_lock(&crypto_init);
-	if (f2fs_bounce_page_pool)
-		goto already_initialized;
-
-	for (i = 0; i < num_prealloc_crypto_ctxs; i++) {
-		struct f2fs_crypto_ctx *ctx;
-
-		ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL);
-		if (!ctx)
-			goto fail;
-		list_add(&ctx->free_list, &f2fs_free_crypto_ctxs);
-	}
-
-	/* must be allocated at the last step to avoid race condition above */
-	f2fs_bounce_page_pool =
-		mempool_create_page_pool(num_prealloc_crypto_pages, 0);
-	if (!f2fs_bounce_page_pool)
-		goto fail;
-
-already_initialized:
-	mutex_unlock(&crypto_init);
-	return 0;
-fail:
-	f2fs_crypto_destroy();
-	mutex_unlock(&crypto_init);
-	return res;
-}
-
-/**
- * f2fs_exit_crypto() - Shutdown the f2fs encryption system
- */
-void f2fs_exit_crypto(void)
-{
-	f2fs_crypto_destroy();
-
-	if (f2fs_read_workqueue)
-		destroy_workqueue(f2fs_read_workqueue);
-	if (f2fs_crypto_ctx_cachep)
-		kmem_cache_destroy(f2fs_crypto_ctx_cachep);
-	if (f2fs_crypt_info_cachep)
-		kmem_cache_destroy(f2fs_crypt_info_cachep);
-}
-
-int __init f2fs_init_crypto(void)
-{
-	int res = -ENOMEM;
-
-	f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0);
-	if (!f2fs_read_workqueue)
-		goto fail;
-
-	f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx,
-						SLAB_RECLAIM_ACCOUNT);
-	if (!f2fs_crypto_ctx_cachep)
-		goto fail;
-
-	f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info,
-						SLAB_RECLAIM_ACCOUNT);
-	if (!f2fs_crypt_info_cachep)
-		goto fail;
-
-	return 0;
-fail:
-	f2fs_exit_crypto();
-	return res;
-}
-
-void f2fs_restore_and_release_control_page(struct page **page)
-{
-	struct f2fs_crypto_ctx *ctx;
-	struct page *bounce_page;
-
-	/* The bounce data pages are unmapped. */
-	if ((*page)->mapping)
-		return;
-
-	/* The bounce data page is unmapped. */
-	bounce_page = *page;
-	ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page);
-
-	/* restore control page */
-	*page = ctx->w.control_page;
-
-	f2fs_restore_control_page(bounce_page);
-}
-
-void f2fs_restore_control_page(struct page *data_page)
-{
-	struct f2fs_crypto_ctx *ctx =
-		(struct f2fs_crypto_ctx *)page_private(data_page);
-
-	set_page_private(data_page, (unsigned long)NULL);
-	ClearPagePrivate(data_page);
-	unlock_page(data_page);
-	f2fs_release_crypto_ctx(ctx);
-}
-
-/**
- * f2fs_crypt_complete() - The completion callback for page encryption
- * @req: The asynchronous encryption request context
- * @res: The result of the encryption operation
- */
-static void f2fs_crypt_complete(struct crypto_async_request *req, int res)
-{
-	struct f2fs_completion_result *ecr = req->data;
-
-	if (res == -EINPROGRESS)
-		return;
-	ecr->res = res;
-	complete(&ecr->completion);
-}
-
-typedef enum {
-	F2FS_DECRYPT = 0,
-	F2FS_ENCRYPT,
-} f2fs_direction_t;
-
-static int f2fs_page_crypto(struct inode *inode,
-				f2fs_direction_t rw,
-				pgoff_t index,
-				struct page *src_page,
-				struct page *dest_page)
-{
-	u8 xts_tweak[F2FS_XTS_TWEAK_SIZE];
-	struct ablkcipher_request *req = NULL;
-	DECLARE_F2FS_COMPLETION_RESULT(ecr);
-	struct scatterlist dst, src;
-	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
-	int res = 0;
-
-	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
-	if (!req) {
-		printk_ratelimited(KERN_ERR
-				"%s: crypto_request_alloc() failed\n",
-				__func__);
-		return -ENOMEM;
-	}
-	ablkcipher_request_set_callback(
-		req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-		f2fs_crypt_complete, &ecr);
-
-	BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index));
-	memcpy(xts_tweak, &index, sizeof(index));
-	memset(&xts_tweak[sizeof(index)], 0,
-			F2FS_XTS_TWEAK_SIZE - sizeof(index));
-
-	sg_init_table(&dst, 1);
-	sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0);
-	sg_init_table(&src, 1);
-	sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0);
-	ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE,
-					xts_tweak);
-	if (rw == F2FS_DECRYPT)
-		res = crypto_ablkcipher_decrypt(req);
-	else
-		res = crypto_ablkcipher_encrypt(req);
-	if (res == -EINPROGRESS || res == -EBUSY) {
-		wait_for_completion(&ecr.completion);
-		res = ecr.res;
-	}
-	ablkcipher_request_free(req);
-	if (res) {
-		printk_ratelimited(KERN_ERR
-			"%s: crypto_ablkcipher_encrypt() returned %d\n",
-			__func__, res);
-		return res;
-	}
-	return 0;
-}
-
-static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx)
-{
-	ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT);
-	if (ctx->w.bounce_page == NULL)
-		return ERR_PTR(-ENOMEM);
-	ctx->flags |= F2FS_WRITE_PATH_FL;
-	return ctx->w.bounce_page;
-}
-
-/**
- * f2fs_encrypt() - Encrypts a page
- * @inode:          The inode for which the encryption should take place
- * @plaintext_page: The page to encrypt. Must be locked.
- *
- * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx
- * encryption context.
- *
- * Called on the page write path.  The caller must call
- * f2fs_restore_control_page() on the returned ciphertext page to
- * release the bounce buffer and the encryption context.
- *
- * Return: An allocated page with the encrypted content on success. Else, an
- * error value or NULL.
- */
-struct page *f2fs_encrypt(struct inode *inode,
-			  struct page *plaintext_page)
-{
-	struct f2fs_crypto_ctx *ctx;
-	struct page *ciphertext_page = NULL;
-	int err;
-
-	BUG_ON(!PageLocked(plaintext_page));
-
-	ctx = f2fs_get_crypto_ctx(inode);
-	if (IS_ERR(ctx))
-		return (struct page *)ctx;
-
-	/* The encryption operation will require a bounce page. */
-	ciphertext_page = alloc_bounce_page(ctx);
-	if (IS_ERR(ciphertext_page))
-		goto err_out;
-
-	ctx->w.control_page = plaintext_page;
-	err = f2fs_page_crypto(inode, F2FS_ENCRYPT, plaintext_page->index,
-					plaintext_page, ciphertext_page);
-	if (err) {
-		ciphertext_page = ERR_PTR(err);
-		goto err_out;
-	}
-
-	SetPagePrivate(ciphertext_page);
-	set_page_private(ciphertext_page, (unsigned long)ctx);
-	lock_page(ciphertext_page);
-	return ciphertext_page;
-
-err_out:
-	f2fs_release_crypto_ctx(ctx);
-	return ciphertext_page;
-}
-
-/**
- * f2fs_decrypt() - Decrypts a page in-place
- * @ctx:  The encryption context.
- * @page: The page to decrypt. Must be locked.
- *
- * Decrypts page in-place using the ctx encryption context.
- *
- * Called from the read completion callback.
- *
- * Return: Zero on success, non-zero otherwise.
- */
-int f2fs_decrypt(struct page *page)
-{
-	BUG_ON(!PageLocked(page));
-
-	return f2fs_page_crypto(page->mapping->host,
-				F2FS_DECRYPT, page->index, page, page);
-}
-
-bool f2fs_valid_contents_enc_mode(uint32_t mode)
-{
-	return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS);
-}
-
-/**
- * f2fs_validate_encryption_key_size() - Validate the encryption key size
- * @mode: The key mode.
- * @size: The key size to validate.
- *
- * Return: The validated key size for @mode. Zero if invalid.
- */
-uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size)
-{
-	if (size == f2fs_encryption_key_size(mode))
-		return size;
-	return 0;
-}
diff --git a/fs/f2fs/crypto_fname.c b/fs/f2fs/crypto_fname.c
deleted file mode 100644
index 6dfdc978fe45..000000000000
--- a/fs/f2fs/crypto_fname.c
+++ /dev/null
@@ -1,446 +0,0 @@
-/*
- * linux/fs/f2fs/crypto_fname.c
- *
- * Copied from linux/fs/ext4/crypto.c
- *
- * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2015, Motorola Mobility
- *
- * This contains functions for filename crypto management in f2fs
- *
- * Written by Uday Savagaonkar, 2014.
- *
- * Adjust f2fs dentry structure
- *	Jaegeuk Kim, 2015.
- *
- * This has not yet undergone a rigorous security audit.
- */
-#include <crypto/hash.h>
-#include <crypto/sha.h>
-#include <keys/encrypted-type.h>
-#include <keys/user-type.h>
-#include <linux/crypto.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/key.h>
-#include <linux/list.h>
-#include <linux/mempool.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <linux/spinlock_types.h>
-#include <linux/f2fs_fs.h>
-#include <linux/ratelimit.h>
-
-#include "f2fs.h"
-#include "f2fs_crypto.h"
-#include "xattr.h"
-
-/**
- * f2fs_dir_crypt_complete() -
- */
-static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res)
-{
-	struct f2fs_completion_result *ecr = req->data;
-
-	if (res == -EINPROGRESS)
-		return;
-	ecr->res = res;
-	complete(&ecr->completion);
-}
-
-bool f2fs_valid_filenames_enc_mode(uint32_t mode)
-{
-	return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS);
-}
-
-static unsigned max_name_len(struct inode *inode)
-{
-	return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize :
-					F2FS_NAME_LEN;
-}
-
-/**
- * f2fs_fname_encrypt() -
- *
- * This function encrypts the input filename, and returns the length of the
- * ciphertext. Errors are returned as negative numbers.  We trust the caller to
- * allocate sufficient memory to oname string.
- */
-static int f2fs_fname_encrypt(struct inode *inode,
-			const struct qstr *iname, struct f2fs_str *oname)
-{
-	u32 ciphertext_len;
-	struct ablkcipher_request *req = NULL;
-	DECLARE_F2FS_COMPLETION_RESULT(ecr);
-	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
-	int res = 0;
-	char iv[F2FS_CRYPTO_BLOCK_SIZE];
-	struct scatterlist src_sg, dst_sg;
-	int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
-	char *workbuf, buf[32], *alloc_buf = NULL;
-	unsigned lim = max_name_len(inode);
-
-	if (iname->len <= 0 || iname->len > lim)
-		return -EIO;
-
-	ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ?
-		F2FS_CRYPTO_BLOCK_SIZE : iname->len;
-	ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding);
-	ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len;
-
-	if (ciphertext_len <= sizeof(buf)) {
-		workbuf = buf;
-	} else {
-		alloc_buf = kmalloc(ciphertext_len, GFP_NOFS);
-		if (!alloc_buf)
-			return -ENOMEM;
-		workbuf = alloc_buf;
-	}
-
-	/* Allocate request */
-	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
-	if (!req) {
-		printk_ratelimited(KERN_ERR
-			"%s: crypto_request_alloc() failed\n", __func__);
-		kfree(alloc_buf);
-		return -ENOMEM;
-	}
-	ablkcipher_request_set_callback(req,
-			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-			f2fs_dir_crypt_complete, &ecr);
-
-	/* Copy the input */
-	memcpy(workbuf, iname->name, iname->len);
-	if (iname->len < ciphertext_len)
-		memset(workbuf + iname->len, 0, ciphertext_len - iname->len);
-
-	/* Initialize IV */
-	memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE);
-
-	/* Create encryption request */
-	sg_init_one(&src_sg, workbuf, ciphertext_len);
-	sg_init_one(&dst_sg, oname->name, ciphertext_len);
-	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
-	res = crypto_ablkcipher_encrypt(req);
-	if (res == -EINPROGRESS || res == -EBUSY) {
-		wait_for_completion(&ecr.completion);
-		res = ecr.res;
-	}
-	kfree(alloc_buf);
-	ablkcipher_request_free(req);
-	if (res < 0) {
-		printk_ratelimited(KERN_ERR
-				"%s: Error (error code %d)\n", __func__, res);
-	}
-	oname->len = ciphertext_len;
-	return res;
-}
-
-/*
- * f2fs_fname_decrypt()
- *	This function decrypts the input filename, and returns
- *	the length of the plaintext.
- *	Errors are returned as negative numbers.
- *	We trust the caller to allocate sufficient memory to oname string.
- */
-static int f2fs_fname_decrypt(struct inode *inode,
-			const struct f2fs_str *iname, struct f2fs_str *oname)
-{
-	struct ablkcipher_request *req = NULL;
-	DECLARE_F2FS_COMPLETION_RESULT(ecr);
-	struct scatterlist src_sg, dst_sg;
-	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-	struct crypto_ablkcipher *tfm = ci->ci_ctfm;
-	int res = 0;
-	char iv[F2FS_CRYPTO_BLOCK_SIZE];
-	unsigned lim = max_name_len(inode);
-
-	if (iname->len <= 0 || iname->len > lim)
-		return -EIO;
-
-	/* Allocate request */
-	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
-	if (!req) {
-		printk_ratelimited(KERN_ERR
-			"%s: crypto_request_alloc() failed\n",  __func__);
-		return -ENOMEM;
-	}
-	ablkcipher_request_set_callback(req,
-		CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-		f2fs_dir_crypt_complete, &ecr);
-
-	/* Initialize IV */
-	memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE);
-
-	/* Create decryption request */
-	sg_init_one(&src_sg, iname->name, iname->len);
-	sg_init_one(&dst_sg, oname->name, oname->len);
-	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
-	res = crypto_ablkcipher_decrypt(req);
-	if (res == -EINPROGRESS || res == -EBUSY) {
-		wait_for_completion(&ecr.completion);
-		res = ecr.res;
-	}
-	ablkcipher_request_free(req);
-	if (res < 0) {
-		printk_ratelimited(KERN_ERR
-			"%s: Error in f2fs_fname_decrypt (error code %d)\n",
-			__func__, res);
-		return res;
-	}
-
-	oname->len = strnlen(oname->name, iname->len);
-	return oname->len;
-}
-
-static const char *lookup_table =
-	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,";
-
-/**
- * f2fs_fname_encode_digest() -
- *
- * Encodes the input digest using characters from the set [a-zA-Z0-9_+].
- * The encoded string is roughly 4/3 times the size of the input string.
- */
-static int digest_encode(const char *src, int len, char *dst)
-{
-	int i = 0, bits = 0, ac = 0;
-	char *cp = dst;
-
-	while (i < len) {
-		ac += (((unsigned char) src[i]) << bits);
-		bits += 8;
-		do {
-			*cp++ = lookup_table[ac & 0x3f];
-			ac >>= 6;
-			bits -= 6;
-		} while (bits >= 6);
-		i++;
-	}
-	if (bits)
-		*cp++ = lookup_table[ac & 0x3f];
-	return cp - dst;
-}
-
-static int digest_decode(const char *src, int len, char *dst)
-{
-	int i = 0, bits = 0, ac = 0;
-	const char *p;
-	char *cp = dst;
-
-	while (i < len) {
-		p = strchr(lookup_table, src[i]);
-		if (p == NULL || src[i] == 0)
-			return -2;
-		ac += (p - lookup_table) << bits;
-		bits += 6;
-		if (bits >= 8) {
-			*cp++ = ac & 0xff;
-			ac >>= 8;
-			bits -= 8;
-		}
-		i++;
-	}
-	if (ac)
-		return -1;
-	return cp - dst;
-}
-
-/**
- * f2fs_fname_crypto_round_up() -
- *
- * Return: The next multiple of block size
- */
-u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize)
-{
-	return ((size + blksize - 1) / blksize) * blksize;
-}
-
-unsigned f2fs_fname_encrypted_size(struct inode *inode, u32 ilen)
-{
-	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-	int padding = 32;
-
-	if (ci)
-		padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK);
-	if (ilen < F2FS_CRYPTO_BLOCK_SIZE)
-		ilen = F2FS_CRYPTO_BLOCK_SIZE;
-	return f2fs_fname_crypto_round_up(ilen, padding);
-}
-
-/**
- * f2fs_fname_crypto_alloc_obuff() -
- *
- * Allocates an output buffer that is sufficient for the crypto operation
- * specified by the context and the direction.
- */
-int f2fs_fname_crypto_alloc_buffer(struct inode *inode,
-				   u32 ilen, struct f2fs_str *crypto_str)
-{
-	unsigned int olen = f2fs_fname_encrypted_size(inode, ilen);
-
-	crypto_str->len = olen;
-	if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2)
-		olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2;
-	/* Allocated buffer can hold one more character to null-terminate the
-	 * string */
-	crypto_str->name = kmalloc(olen + 1, GFP_NOFS);
-	if (!(crypto_str->name))
-		return -ENOMEM;
-	return 0;
-}
-
-/**
- * f2fs_fname_crypto_free_buffer() -
- *
- * Frees the buffer allocated for crypto operation.
- */
-void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str)
-{
-	if (!crypto_str)
-		return;
-	kfree(crypto_str->name);
-	crypto_str->name = NULL;
-}
-
-/**
- * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space
- */
-int f2fs_fname_disk_to_usr(struct inode *inode,
-			f2fs_hash_t *hash,
-			const struct f2fs_str *iname,
-			struct f2fs_str *oname)
-{
-	const struct qstr qname = FSTR_TO_QSTR(iname);
-	char buf[24];
-	int ret;
-
-	if (is_dot_dotdot(&qname)) {
-		oname->name[0] = '.';
-		oname->name[iname->len - 1] = '.';
-		oname->len = iname->len;
-		return oname->len;
-	}
-	if (iname->len < F2FS_CRYPTO_BLOCK_SIZE) {
-		printk("encrypted inode too small");
-		return -EUCLEAN;
-	}
-	if (F2FS_I(inode)->i_crypt_info)
-		return f2fs_fname_decrypt(inode, iname, oname);
-
-	if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) {
-		ret = digest_encode(iname->name, iname->len, oname->name);
-		oname->len = ret;
-		return ret;
-	}
-	if (hash) {
-		memcpy(buf, hash, 4);
-		memset(buf + 4, 0, 4);
-	} else
-		memset(buf, 0, 8);
-	memcpy(buf + 8, iname->name + iname->len - 16, 16);
-	oname->name[0] = '_';
-	ret = digest_encode(buf, 24, oname->name + 1);
-	oname->len = ret + 1;
-	return ret + 1;
-}
-
-/**
- * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space
- */
-int f2fs_fname_usr_to_disk(struct inode *inode,
-			const struct qstr *iname,
-			struct f2fs_str *oname)
-{
-	int res;
-	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-
-	if (is_dot_dotdot(iname)) {
-		oname->name[0] = '.';
-		oname->name[iname->len - 1] = '.';
-		oname->len = iname->len;
-		return oname->len;
-	}
-
-	if (ci) {
-		res = f2fs_fname_encrypt(inode, iname, oname);
-		return res;
-	}
-	/* Without a proper key, a user is not allowed to modify the filenames
-	 * in a directory. Consequently, a user space name cannot be mapped to
-	 * a disk-space name */
-	return -EACCES;
-}
-
-int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname,
-			      int lookup, struct f2fs_filename *fname)
-{
-	struct f2fs_crypt_info *ci;
-	int ret = 0, bigname = 0;
-
-	memset(fname, 0, sizeof(struct f2fs_filename));
-	fname->usr_fname = iname;
-
-	if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) {
-		fname->disk_name.name = (unsigned char *)iname->name;
-		fname->disk_name.len = iname->len;
-		return 0;
-	}
-	ret = f2fs_get_encryption_info(dir);
-	if (ret)
-		return ret;
-	ci = F2FS_I(dir)->i_crypt_info;
-	if (ci) {
-		ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len,
-						     &fname->crypto_buf);
-		if (ret < 0)
-			return ret;
-		ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf);
-		if (ret < 0)
-			goto errout;
-		fname->disk_name.name = fname->crypto_buf.name;
-		fname->disk_name.len = fname->crypto_buf.len;
-		return 0;
-	}
-	if (!lookup)
-		return -EACCES;
-
-	/* We don't have the key and we are doing a lookup; decode the
-	 * user-supplied name
-	 */
-	if (iname->name[0] == '_')
-		bigname = 1;
-	if ((bigname && (iname->len != 33)) ||
-	    (!bigname && (iname->len > 43)))
-		return -ENOENT;
-
-	fname->crypto_buf.name = kmalloc(32, GFP_KERNEL);
-	if (fname->crypto_buf.name == NULL)
-		return -ENOMEM;
-	ret = digest_decode(iname->name + bigname, iname->len - bigname,
-				fname->crypto_buf.name);
-	if (ret < 0) {
-		ret = -ENOENT;
-		goto errout;
-	}
-	fname->crypto_buf.len = ret;
-	if (bigname) {
-		memcpy(&fname->hash, fname->crypto_buf.name, 4);
-	} else {
-		fname->disk_name.name = fname->crypto_buf.name;
-		fname->disk_name.len = fname->crypto_buf.len;
-	}
-	return 0;
-errout:
-	f2fs_fname_crypto_free_buffer(&fname->crypto_buf);
-	return ret;
-}
-
-void f2fs_fname_free_filename(struct f2fs_filename *fname)
-{
-	kfree(fname->crypto_buf.name);
-	fname->crypto_buf.name = NULL;
-	fname->usr_fname = NULL;
-	fname->disk_name.name = NULL;
-}
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
deleted file mode 100644
index 9094fca692fb..000000000000
--- a/fs/f2fs/crypto_key.c
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * linux/fs/f2fs/crypto_key.c
- *
- * Copied from linux/fs/f2fs/crypto_key.c
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption key functions for f2fs
- *
- * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015.
- */
-#include <keys/encrypted-type.h>
-#include <keys/user-type.h>
-#include <linux/random.h>
-#include <linux/scatterlist.h>
-#include <uapi/linux/keyctl.h>
-#include <crypto/hash.h>
-#include <linux/f2fs_fs.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-static void derive_crypt_complete(struct crypto_async_request *req, int rc)
-{
-	struct f2fs_completion_result *ecr = req->data;
-
-	if (rc == -EINPROGRESS)
-		return;
-
-	ecr->res = rc;
-	complete(&ecr->completion);
-}
-
-/**
- * f2fs_derive_key_aes() - Derive a key using AES-128-ECB
- * @deriving_key: Encryption key used for derivation.
- * @source_key:   Source key to which to apply derivation.
- * @derived_key:  Derived key.
- *
- * Return: Zero on success; non-zero otherwise.
- */
-static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE],
-				char source_key[F2FS_AES_256_XTS_KEY_SIZE],
-				char derived_key[F2FS_AES_256_XTS_KEY_SIZE])
-{
-	int res = 0;
-	struct ablkcipher_request *req = NULL;
-	DECLARE_F2FS_COMPLETION_RESULT(ecr);
-	struct scatterlist src_sg, dst_sg;
-	struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0,
-								0);
-
-	if (IS_ERR(tfm)) {
-		res = PTR_ERR(tfm);
-		tfm = NULL;
-		goto out;
-	}
-	crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY);
-	req = ablkcipher_request_alloc(tfm, GFP_NOFS);
-	if (!req) {
-		res = -ENOMEM;
-		goto out;
-	}
-	ablkcipher_request_set_callback(req,
-			CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-			derive_crypt_complete, &ecr);
-	res = crypto_ablkcipher_setkey(tfm, deriving_key,
-				F2FS_AES_128_ECB_KEY_SIZE);
-	if (res < 0)
-		goto out;
-
-	sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE);
-	sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE);
-	ablkcipher_request_set_crypt(req, &src_sg, &dst_sg,
-					F2FS_AES_256_XTS_KEY_SIZE, NULL);
-	res = crypto_ablkcipher_encrypt(req);
-	if (res == -EINPROGRESS || res == -EBUSY) {
-		wait_for_completion(&ecr.completion);
-		res = ecr.res;
-	}
-out:
-	if (req)
-		ablkcipher_request_free(req);
-	if (tfm)
-		crypto_free_ablkcipher(tfm);
-	return res;
-}
-
-static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci)
-{
-	if (!ci)
-		return;
-
-	key_put(ci->ci_keyring_key);
-	crypto_free_ablkcipher(ci->ci_ctfm);
-	kmem_cache_free(f2fs_crypt_info_cachep, ci);
-}
-
-void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci)
-{
-	struct f2fs_inode_info *fi = F2FS_I(inode);
-	struct f2fs_crypt_info *prev;
-
-	if (ci == NULL)
-		ci = ACCESS_ONCE(fi->i_crypt_info);
-	if (ci == NULL)
-		return;
-	prev = cmpxchg(&fi->i_crypt_info, ci, NULL);
-	if (prev != ci)
-		return;
-
-	f2fs_free_crypt_info(ci);
-}
-
-int _f2fs_get_encryption_info(struct inode *inode)
-{
-	struct f2fs_inode_info *fi = F2FS_I(inode);
-	struct f2fs_crypt_info *crypt_info;
-	char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
-				(F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1];
-	struct key *keyring_key = NULL;
-	struct f2fs_encryption_key *master_key;
-	struct f2fs_encryption_context ctx;
-	const struct user_key_payload *ukp;
-	struct crypto_ablkcipher *ctfm;
-	const char *cipher_str;
-	char raw_key[F2FS_MAX_KEY_SIZE];
-	char mode;
-	int res;
-
-	res = f2fs_crypto_initialize();
-	if (res)
-		return res;
-retry:
-	crypt_info = ACCESS_ONCE(fi->i_crypt_info);
-	if (crypt_info) {
-		if (!crypt_info->ci_keyring_key ||
-				key_validate(crypt_info->ci_keyring_key) == 0)
-			return 0;
-		f2fs_free_encryption_info(inode, crypt_info);
-		goto retry;
-	}
-
-	res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
-				&ctx, sizeof(ctx), NULL);
-	if (res < 0)
-		return res;
-	else if (res != sizeof(ctx))
-		return -EINVAL;
-	res = 0;
-
-	crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS);
-	if (!crypt_info)
-		return -ENOMEM;
-
-	crypt_info->ci_flags = ctx.flags;
-	crypt_info->ci_data_mode = ctx.contents_encryption_mode;
-	crypt_info->ci_filename_mode = ctx.filenames_encryption_mode;
-	crypt_info->ci_ctfm = NULL;
-	crypt_info->ci_keyring_key = NULL;
-	memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor,
-				sizeof(crypt_info->ci_master_key));
-	if (S_ISREG(inode->i_mode))
-		mode = crypt_info->ci_data_mode;
-	else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		mode = crypt_info->ci_filename_mode;
-	else
-		BUG();
-
-	switch (mode) {
-	case F2FS_ENCRYPTION_MODE_AES_256_XTS:
-		cipher_str = "xts(aes)";
-		break;
-	case F2FS_ENCRYPTION_MODE_AES_256_CTS:
-		cipher_str = "cts(cbc(aes))";
-		break;
-	default:
-		printk_once(KERN_WARNING
-			    "f2fs: unsupported key mode %d (ino %u)\n",
-			    mode, (unsigned) inode->i_ino);
-		res = -ENOKEY;
-		goto out;
-	}
-
-	memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX,
-					F2FS_KEY_DESC_PREFIX_SIZE);
-	sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE,
-					"%*phN", F2FS_KEY_DESCRIPTOR_SIZE,
-					ctx.master_key_descriptor);
-	full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE +
-					(2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0';
-	keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL);
-	if (IS_ERR(keyring_key)) {
-		res = PTR_ERR(keyring_key);
-		keyring_key = NULL;
-		goto out;
-	}
-	crypt_info->ci_keyring_key = keyring_key;
-	if (keyring_key->type != &key_type_logon) {
-		printk_once(KERN_WARNING "f2fs: key type must be logon\n");
-		res = -ENOKEY;
-		goto out;
-	}
-	down_read(&keyring_key->sem);
-	ukp = user_key_payload(keyring_key);
-	if (ukp->datalen != sizeof(struct f2fs_encryption_key)) {
-		res = -EINVAL;
-		up_read(&keyring_key->sem);
-		goto out;
-	}
-	master_key = (struct f2fs_encryption_key *)ukp->data;
-	BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE !=
-				F2FS_KEY_DERIVATION_NONCE_SIZE);
-	if (master_key->size != F2FS_AES_256_XTS_KEY_SIZE) {
-		printk_once(KERN_WARNING
-				"f2fs: key size incorrect: %d\n",
-				master_key->size);
-		res = -ENOKEY;
-		up_read(&keyring_key->sem);
-		goto out;
-	}
-	res = f2fs_derive_key_aes(ctx.nonce, master_key->raw,
-				  raw_key);
-	up_read(&keyring_key->sem);
-	if (res)
-		goto out;
-
-	ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0);
-	if (!ctfm || IS_ERR(ctfm)) {
-		res = ctfm ? PTR_ERR(ctfm) : -ENOMEM;
-		printk(KERN_DEBUG
-		       "%s: error %d (inode %u) allocating crypto tfm\n",
-		       __func__, res, (unsigned) inode->i_ino);
-		goto out;
-	}
-	crypt_info->ci_ctfm = ctfm;
-	crypto_ablkcipher_clear_flags(ctfm, ~0);
-	crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm),
-			     CRYPTO_TFM_REQ_WEAK_KEY);
-	res = crypto_ablkcipher_setkey(ctfm, raw_key,
-					f2fs_encryption_key_size(mode));
-	if (res)
-		goto out;
-
-	memzero_explicit(raw_key, sizeof(raw_key));
-	if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) {
-		f2fs_free_crypt_info(crypt_info);
-		goto retry;
-	}
-	return 0;
-
-out:
-	if (res == -ENOKEY && !S_ISREG(inode->i_mode))
-		res = 0;
-
-	f2fs_free_crypt_info(crypt_info);
-	memzero_explicit(raw_key, sizeof(raw_key));
-	return res;
-}
-
-int f2fs_has_encryption_key(struct inode *inode)
-{
-	struct f2fs_inode_info *fi = F2FS_I(inode);
-
-	return (fi->i_crypt_info != NULL);
-}
diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c
deleted file mode 100644
index 596f02490f27..000000000000
--- a/fs/f2fs/crypto_policy.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * copied from linux/fs/ext4/crypto_policy.c
- *
- * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2015, Motorola Mobility.
- *
- * This contains encryption policy functions for f2fs with some modifications
- * to support f2fs-specific xattr APIs.
- *
- * Written by Michael Halcrow, 2015.
- * Modified by Jaegeuk Kim, 2015.
- */
-#include <linux/random.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/f2fs_fs.h>
-
-#include "f2fs.h"
-#include "xattr.h"
-
-static int f2fs_inode_has_encryption_context(struct inode *inode)
-{
-	int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-			F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL);
-	return (res > 0);
-}
-
-/*
- * check whether the policy is consistent with the encryption context
- * for the inode
- */
-static int f2fs_is_encryption_context_consistent_with_policy(
-	struct inode *inode, const struct f2fs_encryption_policy *policy)
-{
-	struct f2fs_encryption_context ctx;
-	int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
-				sizeof(ctx), NULL);
-
-	if (res != sizeof(ctx))
-		return 0;
-
-	return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor,
-				F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
-			(ctx.flags == policy->flags) &&
-			(ctx.contents_encryption_mode ==
-			 policy->contents_encryption_mode) &&
-			(ctx.filenames_encryption_mode ==
-			 policy->filenames_encryption_mode));
-}
-
-static int f2fs_create_encryption_context_from_policy(
-	struct inode *inode, const struct f2fs_encryption_policy *policy)
-{
-	struct f2fs_encryption_context ctx;
-
-	ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
-	memcpy(ctx.master_key_descriptor, policy->master_key_descriptor,
-			F2FS_KEY_DESCRIPTOR_SIZE);
-
-	if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) {
-		printk(KERN_WARNING
-		       "%s: Invalid contents encryption mode %d\n", __func__,
-			policy->contents_encryption_mode);
-		return -EINVAL;
-	}
-
-	if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) {
-		printk(KERN_WARNING
-		       "%s: Invalid filenames encryption mode %d\n", __func__,
-			policy->filenames_encryption_mode);
-		return -EINVAL;
-	}
-
-	if (policy->flags & ~F2FS_POLICY_FLAGS_VALID)
-		return -EINVAL;
-
-	ctx.contents_encryption_mode = policy->contents_encryption_mode;
-	ctx.filenames_encryption_mode = policy->filenames_encryption_mode;
-	ctx.flags = policy->flags;
-	BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE);
-	get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
-
-	return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-			F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
-			sizeof(ctx), NULL, XATTR_CREATE);
-}
-
-int f2fs_process_policy(const struct f2fs_encryption_policy *policy,
-			struct inode *inode)
-{
-	if (policy->version != 0)
-		return -EINVAL;
-
-	if (!S_ISDIR(inode->i_mode))
-		return -EINVAL;
-
-	if (!f2fs_inode_has_encryption_context(inode)) {
-		if (!f2fs_empty_dir(inode))
-			return -ENOTEMPTY;
-		return f2fs_create_encryption_context_from_policy(inode,
-								  policy);
-	}
-
-	if (f2fs_is_encryption_context_consistent_with_policy(inode, policy))
-		return 0;
-
-	printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n",
-	       __func__);
-	return -EINVAL;
-}
-
-int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy)
-{
-	struct f2fs_encryption_context ctx;
-	int res;
-
-	if (!f2fs_encrypted_inode(inode))
-		return -ENODATA;
-
-	res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
-				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
-				&ctx, sizeof(ctx), NULL);
-	if (res != sizeof(ctx))
-		return -ENODATA;
-	if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1)
-		return -EINVAL;
-
-	policy->version = 0;
-	policy->contents_encryption_mode = ctx.contents_encryption_mode;
-	policy->filenames_encryption_mode = ctx.filenames_encryption_mode;
-	policy->flags = ctx.flags;
-	memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor,
-			F2FS_KEY_DESCRIPTOR_SIZE);
-	return 0;
-}
-
-int f2fs_is_child_context_consistent_with_parent(struct inode *parent,
-						struct inode *child)
-{
-	struct f2fs_crypt_info *parent_ci, *child_ci;
-	int res;
-
-	if ((parent == NULL) || (child == NULL)) {
-		pr_err("parent %p child %p\n", parent, child);
-		BUG_ON(1);
-	}
-
-	/* no restrictions if the parent directory is not encrypted */
-	if (!f2fs_encrypted_inode(parent))
-		return 1;
-	/* if the child directory is not encrypted, this is always a problem */
-	if (!f2fs_encrypted_inode(child))
-		return 0;
-	res = f2fs_get_encryption_info(parent);
-	if (res)
-		return 0;
-	res = f2fs_get_encryption_info(child);
-	if (res)
-		return 0;
-	parent_ci = F2FS_I(parent)->i_crypt_info;
-	child_ci = F2FS_I(child)->i_crypt_info;
-	if (!parent_ci && !child_ci)
-		return 1;
-	if (!parent_ci || !child_ci)
-		return 0;
-
-	return (memcmp(parent_ci->ci_master_key,
-			child_ci->ci_master_key,
-			F2FS_KEY_DESCRIPTOR_SIZE) == 0 &&
-		(parent_ci->ci_data_mode == child_ci->ci_data_mode) &&
-		(parent_ci->ci_filename_mode == child_ci->ci_filename_mode) &&
-		(parent_ci->ci_flags == child_ci->ci_flags));
-}
-
-/**
- * f2fs_inherit_context() - Sets a child context from its parent
- * @parent: Parent inode from which the context is inherited.
- * @child:  Child inode that inherits the context from @parent.
- *
- * Return: Zero on success, non-zero otherwise
- */
-int f2fs_inherit_context(struct inode *parent, struct inode *child,
-						struct page *ipage)
-{
-	struct f2fs_encryption_context ctx;
-	struct f2fs_crypt_info *ci;
-	int res;
-
-	res = f2fs_get_encryption_info(parent);
-	if (res < 0)
-		return res;
-
-	ci = F2FS_I(parent)->i_crypt_info;
-	if (ci == NULL)
-		return -ENOKEY;
-
-	ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1;
-
-	ctx.contents_encryption_mode = ci->ci_data_mode;
-	ctx.filenames_encryption_mode = ci->ci_filename_mode;
-	ctx.flags = ci->ci_flags;
-	memcpy(ctx.master_key_descriptor, ci->ci_master_key,
-			F2FS_KEY_DESCRIPTOR_SIZE);
-
-	get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE);
-	return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION,
-				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx,
-				sizeof(ctx), ipage, XATTR_CREATE);
-}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 9643d88a01af..e5c762b37239 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -34,9 +34,9 @@ static void f2fs_read_end_io(struct bio *bio)
 
 	if (f2fs_bio_encrypted(bio)) {
 		if (bio->bi_error) {
-			f2fs_release_crypto_ctx(bio->bi_private);
+			fscrypt_release_ctx(bio->bi_private);
 		} else {
-			f2fs_end_io_crypto_work(bio->bi_private, bio);
+			fscrypt_decrypt_bio_pages(bio->bi_private, bio);
 			return;
 		}
 	}
@@ -64,7 +64,7 @@ static void f2fs_write_end_io(struct bio *bio)
 	bio_for_each_segment_all(bvec, bio, i) {
 		struct page *page = bvec->bv_page;
 
-		f2fs_restore_and_release_control_page(&page);
+		fscrypt_pullback_bio_page(&page, true);
 
 		if (unlikely(bio->bi_error)) {
 			set_bit(AS_EIO, &page->mapping->flags);
@@ -129,16 +129,10 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode,
 
 	bio_for_each_segment_all(bvec, io->bio, i) {
 
-		if (bvec->bv_page->mapping) {
+		if (bvec->bv_page->mapping)
 			target = bvec->bv_page;
-		} else {
-			struct f2fs_crypto_ctx *ctx;
-
-			/* encrypted page */
-			ctx = (struct f2fs_crypto_ctx *)page_private(
-								bvec->bv_page);
-			target = ctx->w.control_page;
-		}
+		else
+			target = fscrypt_control_page(bvec->bv_page);
 
 		if (inode && inode == target->mapping->host)
 			return true;
@@ -220,7 +214,8 @@ void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi)
 int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 {
 	struct bio *bio;
-	struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page;
+	struct page *page = fio->encrypted_page ?
+			fio->encrypted_page : fio->page;
 
 	trace_f2fs_submit_page_bio(page, fio);
 	f2fs_trace_ios(fio, 0);
@@ -992,12 +987,12 @@ submit_and_realloc:
 			bio = NULL;
 		}
 		if (bio == NULL) {
-			struct f2fs_crypto_ctx *ctx = NULL;
+			struct fscrypt_ctx *ctx = NULL;
 
 			if (f2fs_encrypted_inode(inode) &&
 					S_ISREG(inode->i_mode)) {
 
-				ctx = f2fs_get_crypto_ctx(inode);
+				ctx = fscrypt_get_ctx(inode);
 				if (IS_ERR(ctx))
 					goto set_error_page;
 
@@ -1010,7 +1005,7 @@ submit_and_realloc:
 				min_t(int, nr_pages, BIO_MAX_PAGES));
 			if (!bio) {
 				if (ctx)
-					f2fs_release_crypto_ctx(ctx);
+					fscrypt_release_ctx(ctx);
 				goto set_error_page;
 			}
 			bio->bi_bdev = bdev;
@@ -1102,7 +1097,7 @@ int do_write_data_page(struct f2fs_io_info *fio)
 		f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
 							fio->old_blkaddr);
 
-		fio->encrypted_page = f2fs_encrypt(inode, fio->page);
+		fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page);
 		if (IS_ERR(fio->encrypted_page)) {
 			err = PTR_ERR(fio->encrypted_page);
 			goto out_writepage;
@@ -1608,7 +1603,7 @@ repeat:
 
 		/* avoid symlink page */
 		if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
-			err = f2fs_decrypt(page);
+			err = fscrypt_decrypt_page(page);
 			if (err)
 				goto fail;
 		}
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 8f09da0552ac..f82e28b121a8 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -77,7 +77,7 @@ static unsigned long dir_block_index(unsigned int level,
 }
 
 static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
-				struct f2fs_filename *fname,
+				struct fscrypt_name *fname,
 				f2fs_hash_t namehash,
 				int *max_slots,
 				struct page **res_page)
@@ -103,15 +103,15 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page,
 	return de;
 }
 
-struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname,
+struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname,
 			f2fs_hash_t namehash, int *max_slots,
 			struct f2fs_dentry_ptr *d)
 {
 	struct f2fs_dir_entry *de;
 	unsigned long bit_pos = 0;
 	int max_len = 0;
-	struct f2fs_str de_name = FSTR_INIT(NULL, 0);
-	struct f2fs_str *name = &fname->disk_name;
+	struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
+	struct fscrypt_str *name = &fname->disk_name;
 
 	if (max_slots)
 		*max_slots = 0;
@@ -157,7 +157,7 @@ found:
 
 static struct f2fs_dir_entry *find_in_level(struct inode *dir,
 					unsigned int level,
-					struct f2fs_filename *fname,
+					struct fscrypt_name *fname,
 					struct page **res_page)
 {
 	struct qstr name = FSTR_TO_QSTR(&fname->disk_name);
@@ -218,12 +218,12 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 	struct f2fs_dir_entry *de = NULL;
 	unsigned int max_depth;
 	unsigned int level;
-	struct f2fs_filename fname;
+	struct fscrypt_name fname;
 	int err;
 
 	*res_page = NULL;
 
-	err = f2fs_fname_setup_filename(dir, child, 1, &fname);
+	err = fscrypt_setup_filename(dir, child, 1, &fname);
 	if (err)
 		return NULL;
 
@@ -251,7 +251,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 			break;
 	}
 out:
-	f2fs_fname_free_filename(&fname);
+	fscrypt_free_filename(&fname);
 	return de;
 }
 
@@ -413,7 +413,7 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir,
 			goto put_error;
 
 		if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) {
-			err = f2fs_inherit_context(dir, inode, page);
+			err = fscrypt_inherit_context(dir, inode, page, false);
 			if (err)
 				goto put_error;
 		}
@@ -536,11 +536,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name,
 	struct f2fs_dentry_block *dentry_blk = NULL;
 	struct f2fs_dentry_ptr d;
 	struct page *page = NULL;
-	struct f2fs_filename fname;
+	struct fscrypt_name fname;
 	struct qstr new_name;
 	int slots, err;
 
-	err = f2fs_fname_setup_filename(dir, name, 0, &fname);
+	err = fscrypt_setup_filename(dir, name, 0, &fname);
 	if (err)
 		return err;
 
@@ -639,7 +639,7 @@ fail:
 	kunmap(dentry_page);
 	f2fs_put_page(dentry_page, 1);
 out:
-	f2fs_fname_free_filename(&fname);
+	fscrypt_free_filename(&fname);
 	f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
 	return err;
 }
@@ -781,12 +781,12 @@ bool f2fs_empty_dir(struct inode *dir)
 }
 
 bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
-				unsigned int start_pos, struct f2fs_str *fstr)
+			unsigned int start_pos, struct fscrypt_str *fstr)
 {
 	unsigned char d_type = DT_UNKNOWN;
 	unsigned int bit_pos;
 	struct f2fs_dir_entry *de = NULL;
-	struct f2fs_str de_name = FSTR_INIT(NULL, 0);
+	struct fscrypt_str de_name = FSTR_INIT(NULL, 0);
 
 	bit_pos = ((unsigned long)ctx->pos % d->max);
 
@@ -820,8 +820,9 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
 
 			memcpy(de_name.name, d->filename[bit_pos], de_name.len);
 
-			ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code,
-							&de_name, fstr);
+			ret = fscrypt_fname_disk_to_usr(d->inode,
+						(u32)de->hash_code, 0,
+						&de_name, fstr);
 			kfree(de_name.name);
 			if (ret < 0)
 				return true;
@@ -849,16 +850,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 	struct file_ra_state *ra = &file->f_ra;
 	unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
 	struct f2fs_dentry_ptr d;
-	struct f2fs_str fstr = FSTR_INIT(NULL, 0);
+	struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
 	int err = 0;
 
 	if (f2fs_encrypted_inode(inode)) {
-		err = f2fs_get_encryption_info(inode);
+		err = fscrypt_get_encryption_info(inode);
 		if (err)
 			return err;
 
-		err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN,
-								&fstr);
+		err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
 		if (err < 0)
 			return err;
 	}
@@ -898,14 +898,14 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 		f2fs_put_page(dentry_page, 1);
 	}
 out:
-	f2fs_fname_crypto_free_buffer(&fstr);
+	fscrypt_fname_free_buffer(&fstr);
 	return err;
 }
 
 static int f2fs_dir_open(struct inode *inode, struct file *filp)
 {
 	if (f2fs_encrypted_inode(inode))
-		return f2fs_get_encryption_info(inode) ? -EACCES : 0;
+		return fscrypt_get_encryption_info(inode) ? -EACCES : 0;
 	return 0;
 }
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ffd03363989b..6447e9002807 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -22,6 +22,7 @@
 #include <linux/vmalloc.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/fscrypto.h>
 
 #ifdef CONFIG_F2FS_CHECK_FS
 #define f2fs_bug_on(sbi, condition)	BUG_ON(condition)
@@ -231,12 +232,9 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
 #define F2FS_IOC_WRITE_CHECKPOINT	_IO(F2FS_IOCTL_MAGIC, 7)
 #define F2FS_IOC_DEFRAGMENT		_IO(F2FS_IOCTL_MAGIC, 8)
 
-#define F2FS_IOC_SET_ENCRYPTION_POLICY					\
-		_IOR('f', 19, struct f2fs_encryption_policy)
-#define F2FS_IOC_GET_ENCRYPTION_PWSALT					\
-		_IOW('f', 20, __u8[16])
-#define F2FS_IOC_GET_ENCRYPTION_POLICY					\
-		_IOW('f', 21, struct f2fs_encryption_policy)
+#define F2FS_IOC_SET_ENCRYPTION_POLICY	FS_IOC_SET_ENCRYPTION_POLICY
+#define F2FS_IOC_GET_ENCRYPTION_POLICY	FS_IOC_GET_ENCRYPTION_POLICY
+#define F2FS_IOC_GET_ENCRYPTION_PWSALT	FS_IOC_GET_ENCRYPTION_PWSALT
 
 /*
  * should be same as XFS_IOC_GOINGDOWN.
@@ -266,25 +264,6 @@ struct f2fs_defragment {
  * For INODE and NODE manager
  */
 /* for directory operations */
-struct f2fs_str {
-	unsigned char *name;
-	u32 len;
-};
-
-struct f2fs_filename {
-	const struct qstr *usr_fname;
-	struct f2fs_str disk_name;
-	f2fs_hash_t hash;
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-	struct f2fs_str crypto_buf;
-#endif
-};
-
-#define FSTR_INIT(n, l)		{ .name = n, .len = l }
-#define FSTR_TO_QSTR(f)		QSTR_INIT((f)->name, (f)->len)
-#define fname_name(p)		((p)->disk_name.name)
-#define fname_len(p)		((p)->disk_name.len)
-
 struct f2fs_dentry_ptr {
 	struct inode *inode;
 	const void *bitmap;
@@ -412,15 +391,6 @@ struct f2fs_map_blocks {
 #define file_enc_name(inode)	is_file(inode, FADVISE_ENC_NAME_BIT)
 #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT)
 
-/* Encryption algorithms */
-#define F2FS_ENCRYPTION_MODE_INVALID		0
-#define F2FS_ENCRYPTION_MODE_AES_256_XTS	1
-#define F2FS_ENCRYPTION_MODE_AES_256_GCM	2
-#define F2FS_ENCRYPTION_MODE_AES_256_CBC	3
-#define F2FS_ENCRYPTION_MODE_AES_256_CTS	4
-
-#include "f2fs_crypto.h"
-
 #define DEF_DIR_LEVEL		0
 
 struct f2fs_inode_info {
@@ -444,13 +414,7 @@ struct f2fs_inode_info {
 	struct list_head dirty_list;	/* linked in global dirty list */
 	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
 	struct mutex inmem_lock;	/* lock for inmemory pages */
-
 	struct extent_tree *extent_tree;	/* cached extent_tree entry */
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-	/* Encryption params */
-	struct f2fs_crypt_info *i_crypt_info;
-#endif
 };
 
 static inline void get_extent_info(struct extent_info *ext,
@@ -1741,10 +1705,10 @@ struct dentry *f2fs_get_parent(struct dentry *child);
 extern unsigned char f2fs_filetype_table[F2FS_FT_MAX];
 void set_de_type(struct f2fs_dir_entry *, umode_t);
 
-struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *,
+struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *,
 			f2fs_hash_t, int *, struct f2fs_dentry_ptr *);
 bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *,
-			unsigned int, struct f2fs_str *);
+			unsigned int, struct fscrypt_str *);
 void do_make_empty_dir(struct inode *, struct inode *,
 			struct f2fs_dentry_ptr *);
 struct page *init_inode_metadata(struct inode *, struct inode *,
@@ -2120,7 +2084,7 @@ int f2fs_convert_inline_inode(struct inode *);
 int f2fs_write_inline_data(struct inode *, struct page *);
 bool recover_inline_data(struct inode *, struct page *);
 struct f2fs_dir_entry *find_in_inline_dir(struct inode *,
-				struct f2fs_filename *, struct page **);
+				struct fscrypt_name *, struct page **);
 struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **);
 int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *);
 int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *,
@@ -2129,7 +2093,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
 						struct inode *, struct inode *);
 bool f2fs_empty_inline_dir(struct inode *);
 int f2fs_read_inline_dir(struct file *, struct dir_context *,
-						struct f2fs_str *);
+						struct fscrypt_str *);
 int f2fs_inline_data_fiemap(struct inode *,
 		struct fiemap_extent_info *, __u64, __u64);
 
@@ -2159,13 +2123,9 @@ void destroy_extent_cache(void);
 /*
  * crypto support
  */
-static inline int f2fs_encrypted_inode(struct inode *inode)
+static inline bool f2fs_encrypted_inode(struct inode *inode)
 {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
 	return file_is_encrypt(inode);
-#else
-	return 0;
-#endif
 }
 
 static inline void f2fs_set_encrypted_inode(struct inode *inode)
@@ -2177,20 +2137,12 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
 
 static inline bool f2fs_bio_encrypted(struct bio *bio)
 {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-	return unlikely(bio->bi_private != NULL);
-#else
-	return false;
-#endif
+	return bio->bi_private != NULL;
 }
 
 static inline int f2fs_sb_has_crypto(struct super_block *sb)
 {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
 	return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT);
-#else
-	return 0;
-#endif
 }
 
 static inline bool f2fs_may_encrypt(struct inode *inode)
@@ -2204,86 +2156,28 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
 #endif
 }
 
-/* crypto_policy.c */
-int f2fs_is_child_context_consistent_with_parent(struct inode *,
-							struct inode *);
-int f2fs_inherit_context(struct inode *, struct inode *, struct page *);
-int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *);
-int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *);
-
-/* crypt.c */
-extern struct kmem_cache *f2fs_crypt_info_cachep;
-bool f2fs_valid_contents_enc_mode(uint32_t);
-uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t);
-struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *);
-void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *);
-struct page *f2fs_encrypt(struct inode *, struct page *);
-int f2fs_decrypt(struct page *);
-void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *);
-
-/* crypto_key.c */
-void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *);
-int _f2fs_get_encryption_info(struct inode *inode);
-
-/* crypto_fname.c */
-bool f2fs_valid_filenames_enc_mode(uint32_t);
-u32 f2fs_fname_crypto_round_up(u32, u32);
-unsigned f2fs_fname_encrypted_size(struct inode *, u32);
-int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *);
-int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *,
-			const struct f2fs_str *, struct f2fs_str *);
-int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *,
-			struct f2fs_str *);
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-void f2fs_restore_and_release_control_page(struct page **);
-void f2fs_restore_control_page(struct page *);
-
-int __init f2fs_init_crypto(void);
-int f2fs_crypto_initialize(void);
-void f2fs_exit_crypto(void);
-
-int f2fs_has_encryption_key(struct inode *);
-
-static inline int f2fs_get_encryption_info(struct inode *inode)
-{
-	struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info;
-
-	if (!ci ||
-		(ci->ci_keyring_key &&
-		 (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
-					       (1 << KEY_FLAG_REVOKED) |
-					       (1 << KEY_FLAG_DEAD)))))
-		return _f2fs_get_encryption_info(inode);
-	return 0;
-}
-
-void f2fs_fname_crypto_free_buffer(struct f2fs_str *);
-int f2fs_fname_setup_filename(struct inode *, const struct qstr *,
-				int lookup, struct f2fs_filename *);
-void f2fs_fname_free_filename(struct f2fs_filename *);
-#else
-static inline void f2fs_restore_and_release_control_page(struct page **p) { }
-static inline void f2fs_restore_control_page(struct page *p) { }
-
-static inline int __init f2fs_init_crypto(void) { return 0; }
-static inline void f2fs_exit_crypto(void) { }
-
-static inline int f2fs_has_encryption_key(struct inode *i) { return 0; }
-static inline int f2fs_get_encryption_info(struct inode *i) { return 0; }
-static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { }
-
-static inline int f2fs_fname_setup_filename(struct inode *dir,
-					const struct qstr *iname,
-					int lookup, struct f2fs_filename *fname)
-{
-	memset(fname, 0, sizeof(struct f2fs_filename));
-	fname->usr_fname = iname;
-	fname->disk_name.name = (unsigned char *)iname->name;
-	fname->disk_name.len = iname->len;
-	return 0;
-}
-
-static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { }
+#ifndef CONFIG_F2FS_FS_ENCRYPTION
+#define fscrypt_set_d_op(i)
+#define fscrypt_get_ctx			fscrypt_notsupp_get_ctx
+#define fscrypt_release_ctx		fscrypt_notsupp_release_ctx
+#define fscrypt_encrypt_page		fscrypt_notsupp_encrypt_page
+#define fscrypt_decrypt_page		fscrypt_notsupp_decrypt_page
+#define fscrypt_decrypt_bio_pages	fscrypt_notsupp_decrypt_bio_pages
+#define fscrypt_pullback_bio_page	fscrypt_notsupp_pullback_bio_page
+#define fscrypt_restore_control_page	fscrypt_notsupp_restore_control_page
+#define fscrypt_zeroout_range		fscrypt_notsupp_zeroout_range
+#define fscrypt_process_policy		fscrypt_notsupp_process_policy
+#define fscrypt_get_policy		fscrypt_notsupp_get_policy
+#define fscrypt_has_permitted_context	fscrypt_notsupp_has_permitted_context
+#define fscrypt_inherit_context		fscrypt_notsupp_inherit_context
+#define fscrypt_get_encryption_info	fscrypt_notsupp_get_encryption_info
+#define fscrypt_put_encryption_info	fscrypt_notsupp_put_encryption_info
+#define fscrypt_setup_filename		fscrypt_notsupp_setup_filename
+#define fscrypt_free_filename		fscrypt_notsupp_free_filename
+#define fscrypt_fname_encrypted_size	fscrypt_notsupp_fname_encrypted_size
+#define fscrypt_fname_alloc_buffer	fscrypt_notsupp_fname_alloc_buffer
+#define fscrypt_fname_free_buffer	fscrypt_notsupp_fname_free_buffer
+#define fscrypt_fname_disk_to_usr	fscrypt_notsupp_fname_disk_to_usr
+#define fscrypt_fname_usr_to_disk	fscrypt_notsupp_fname_usr_to_disk
 #endif
 #endif
diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h
deleted file mode 100644
index c2c1c2b63b25..000000000000
--- a/fs/f2fs/f2fs_crypto.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * linux/fs/f2fs/f2fs_crypto.h
- *
- * Copied from linux/fs/ext4/ext4_crypto.h
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * This contains encryption header content for f2fs
- *
- * Written by Michael Halcrow, 2015.
- * Modified by Jaegeuk Kim, 2015.
- */
-#ifndef _F2FS_CRYPTO_H
-#define _F2FS_CRYPTO_H
-
-#include <linux/fs.h>
-
-#define F2FS_KEY_DESCRIPTOR_SIZE	8
-
-/* Policy provided via an ioctl on the topmost directory */
-struct f2fs_encryption_policy {
-	char version;
-	char contents_encryption_mode;
-	char filenames_encryption_mode;
-	char flags;
-	char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
-} __attribute__((__packed__));
-
-#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1	1
-#define F2FS_KEY_DERIVATION_NONCE_SIZE		16
-
-#define F2FS_POLICY_FLAGS_PAD_4		0x00
-#define F2FS_POLICY_FLAGS_PAD_8		0x01
-#define F2FS_POLICY_FLAGS_PAD_16	0x02
-#define F2FS_POLICY_FLAGS_PAD_32	0x03
-#define F2FS_POLICY_FLAGS_PAD_MASK	0x03
-#define F2FS_POLICY_FLAGS_VALID		0x03
-
-/**
- * Encryption context for inode
- *
- * Protector format:
- *  1 byte: Protector format (1 = this version)
- *  1 byte: File contents encryption mode
- *  1 byte: File names encryption mode
- *  1 byte: Flags
- *  8 bytes: Master Key descriptor
- *  16 bytes: Encryption Key derivation nonce
- */
-struct f2fs_encryption_context {
-	char format;
-	char contents_encryption_mode;
-	char filenames_encryption_mode;
-	char flags;
-	char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
-	char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE];
-} __attribute__((__packed__));
-
-/* Encryption parameters */
-#define F2FS_XTS_TWEAK_SIZE 16
-#define F2FS_AES_128_ECB_KEY_SIZE 16
-#define F2FS_AES_256_GCM_KEY_SIZE 32
-#define F2FS_AES_256_CBC_KEY_SIZE 32
-#define F2FS_AES_256_CTS_KEY_SIZE 32
-#define F2FS_AES_256_XTS_KEY_SIZE 64
-#define F2FS_MAX_KEY_SIZE 64
-
-#define F2FS_KEY_DESC_PREFIX "f2fs:"
-#define F2FS_KEY_DESC_PREFIX_SIZE 5
-
-struct f2fs_encryption_key {
-	__u32 mode;
-	char raw[F2FS_MAX_KEY_SIZE];
-	__u32 size;
-} __attribute__((__packed__));
-
-struct f2fs_crypt_info {
-	char		ci_data_mode;
-	char		ci_filename_mode;
-	char		ci_flags;
-	struct crypto_ablkcipher *ci_ctfm;
-	struct key	*ci_keyring_key;
-	char		ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE];
-};
-
-#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL             0x00000001
-#define F2FS_WRITE_PATH_FL			      0x00000002
-
-struct f2fs_crypto_ctx {
-	union {
-		struct {
-			struct page *bounce_page;       /* Ciphertext page */
-			struct page *control_page;      /* Original page  */
-		} w;
-		struct {
-			struct bio *bio;
-			struct work_struct work;
-		} r;
-		struct list_head free_list;     /* Free list */
-	};
-	char flags;                      /* Flags */
-};
-
-struct f2fs_completion_result {
-	struct completion completion;
-	int res;
-};
-
-#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \
-	struct f2fs_completion_result ecr = { \
-		COMPLETION_INITIALIZER((ecr).completion), 0 }
-
-static inline int f2fs_encryption_key_size(int mode)
-{
-	switch (mode) {
-	case F2FS_ENCRYPTION_MODE_AES_256_XTS:
-		return F2FS_AES_256_XTS_KEY_SIZE;
-	case F2FS_ENCRYPTION_MODE_AES_256_GCM:
-		return F2FS_AES_256_GCM_KEY_SIZE;
-	case F2FS_ENCRYPTION_MODE_AES_256_CBC:
-		return F2FS_AES_256_CBC_KEY_SIZE;
-	case F2FS_ENCRYPTION_MODE_AES_256_CTS:
-		return F2FS_AES_256_CTS_KEY_SIZE;
-	default:
-		BUG();
-	}
-	return 0;
-}
-
-#define F2FS_FNAME_NUM_SCATTER_ENTRIES	4
-#define F2FS_CRYPTO_BLOCK_SIZE		16
-#define F2FS_FNAME_CRYPTO_DIGEST_SIZE	32
-
-/**
- * For encrypted symlinks, the ciphertext length is stored at the beginning
- * of the string in little-endian format.
- */
-struct f2fs_encrypted_symlink_data {
-	__le16 len;
-	char encrypted_path[1];
-} __attribute__((__packed__));
-
-/**
- * This function is used to calculate the disk space required to
- * store a filename of length l in encrypted symlink format.
- */
-static inline u32 encrypted_symlink_data_len(u32 l)
-{
-	return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1);
-}
-#endif	/* _F2FS_CRYPTO_H */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ffa1ec20f963..04ab1e4fc1df 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -421,7 +421,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	int err;
 
 	if (f2fs_encrypted_inode(inode)) {
-		err = f2fs_get_encryption_info(inode);
+		err = fscrypt_get_encryption_info(inode);
 		if (err)
 			return 0;
 		if (!f2fs_encrypted_inode(inode))
@@ -443,10 +443,10 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
 	int ret = generic_file_open(inode, filp);
 
 	if (!ret && f2fs_encrypted_inode(inode)) {
-		ret = f2fs_get_encryption_info(inode);
+		ret = fscrypt_get_encryption_info(inode);
 		if (ret)
 			return -EACCES;
-		if (!f2fs_encrypted_inode(inode))
+		if (!fscrypt_has_encryption_key(inode))
 			return -ENOKEY;
 	}
 	return ret;
@@ -526,7 +526,8 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
 truncate_out:
 	f2fs_wait_on_page_writeback(page, DATA, true);
 	zero_user(page, offset, PAGE_CACHE_SIZE - offset);
-	if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode))
+	if (!cache_only || !f2fs_encrypted_inode(inode) ||
+					!S_ISREG(inode->i_mode))
 		set_page_dirty(page);
 	f2fs_put_page(page, 1);
 	return 0;
@@ -674,7 +675,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (attr->ia_valid & ATTR_SIZE) {
 		if (f2fs_encrypted_inode(inode) &&
-				f2fs_get_encryption_info(inode))
+				fscrypt_get_encryption_info(inode))
 			return -EACCES;
 
 		if (attr->ia_size <= i_size_read(inode)) {
@@ -1529,39 +1530,30 @@ static bool uuid_is_nonzero(__u8 u[16])
 
 static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
 {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-	struct f2fs_encryption_policy policy;
+	struct fscrypt_policy policy;
 	struct inode *inode = file_inode(filp);
 
-	if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg,
-				sizeof(policy)))
+	if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg,
+							sizeof(policy)))
 		return -EFAULT;
 
 	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
-	return f2fs_process_policy(&policy, inode);
-#else
-	return -EOPNOTSUPP;
-#endif
+	return fscrypt_process_policy(inode, &policy);
 }
 
 static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
 {
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-	struct f2fs_encryption_policy policy;
+	struct fscrypt_policy policy;
 	struct inode *inode = file_inode(filp);
 	int err;
 
-	err = f2fs_get_policy(inode, &policy);
+	err = fscrypt_get_policy(inode, &policy);
 	if (err)
 		return err;
 
-	if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy,
-							sizeof(policy)))
+	if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy)))
 		return -EFAULT;
 	return 0;
-#else
-	return -EOPNOTSUPP;
-#endif
 }
 
 static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
@@ -1873,8 +1865,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ssize_t ret;
 
 	if (f2fs_encrypted_inode(inode) &&
-				!f2fs_has_encryption_key(inode) &&
-				f2fs_get_encryption_info(inode))
+				!fscrypt_has_encryption_key(inode) &&
+				fscrypt_get_encryption_info(inode))
 		return -EACCES;
 
 	inode_lock(inode);
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 1c00f2c718ae..358214e9f707 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -277,7 +277,7 @@ process_inline:
 }
 
 struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir,
-			struct f2fs_filename *fname, struct page **res_page)
+			struct fscrypt_name *fname, struct page **res_page)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
 	struct f2fs_inline_dentry *inline_dentry;
@@ -535,7 +535,7 @@ bool f2fs_empty_inline_dir(struct inode *dir)
 }
 
 int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
-				struct f2fs_str *fstr)
+				struct fscrypt_str *fstr)
 {
 	struct inode *inode = file_inode(file);
 	struct f2fs_inline_dentry *inline_dentry = NULL;
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index d4477073dbb0..cb269c46ac25 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -389,10 +389,7 @@ no_delete:
 		}
 	}
 out_clear:
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-	if (fi->i_crypt_info)
-		f2fs_free_encryption_info(inode, fi->i_crypt_info);
-#endif
+	fscrypt_put_encryption_info(inode, NULL);
 	clear_inode(inode);
 }
 
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b3c423a645bc..3bddd9f657e5 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -169,7 +169,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
 	int err;
 
 	if (f2fs_encrypted_inode(dir) &&
-		!f2fs_is_child_context_consistent_with_parent(dir, inode))
+			!fscrypt_has_permitted_context(dir, inode))
 		return -EPERM;
 
 	f2fs_balance_fs(sbi, true);
@@ -352,20 +352,20 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
 	size_t len = strlen(symname);
-	struct f2fs_str disk_link = FSTR_INIT((char *)symname, len + 1);
-	struct f2fs_encrypted_symlink_data *sd = NULL;
+	struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1);
+	struct fscrypt_symlink_data *sd = NULL;
 	int err;
 
 	if (f2fs_encrypted_inode(dir)) {
-		err = f2fs_get_encryption_info(dir);
+		err = fscrypt_get_encryption_info(dir);
 		if (err)
 			return err;
 
-		if (!f2fs_encrypted_inode(dir))
+		if (!fscrypt_has_encryption_key(dir))
 			return -EPERM;
 
-		disk_link.len = (f2fs_fname_encrypted_size(dir, len) +
-				sizeof(struct f2fs_encrypted_symlink_data));
+		disk_link.len = (fscrypt_fname_encrypted_size(dir, len) +
+				sizeof(struct fscrypt_symlink_data));
 	}
 
 	if (disk_link.len > dir->i_sb->s_blocksize)
@@ -393,7 +393,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 
 	if (f2fs_encrypted_inode(inode)) {
 		struct qstr istr = QSTR_INIT(symname, len);
-		struct f2fs_str ostr;
+		struct fscrypt_str ostr;
 
 		sd = kzalloc(disk_link.len, GFP_NOFS);
 		if (!sd) {
@@ -401,18 +401,18 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 			goto err_out;
 		}
 
-		err = f2fs_get_encryption_info(inode);
+		err = fscrypt_get_encryption_info(inode);
 		if (err)
 			goto err_out;
 
-		if (!f2fs_encrypted_inode(inode)) {
+		if (!fscrypt_has_encryption_key(inode)) {
 			err = -EPERM;
 			goto err_out;
 		}
 
 		ostr.name = sd->encrypted_path;
 		ostr.len = disk_link.len;
-		err = f2fs_fname_usr_to_disk(inode, &istr, &ostr);
+		err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr);
 		if (err < 0)
 			goto err_out;
 
@@ -593,7 +593,7 @@ out:
 static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	if (f2fs_encrypted_inode(dir)) {
-		int err = f2fs_get_encryption_info(dir);
+		int err = fscrypt_get_encryption_info(dir);
 		if (err)
 			return err;
 	}
@@ -623,8 +623,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int err = -ENOENT;
 
 	if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) &&
-		!f2fs_is_child_context_consistent_with_parent(new_dir,
-							old_inode)) {
+			!fscrypt_has_permitted_context(new_dir, old_inode)) {
 		err = -EPERM;
 		goto out;
 	}
@@ -804,11 +803,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int err = -ENOENT;
 
 	if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) &&
-		(old_dir != new_dir) &&
-		(!f2fs_is_child_context_consistent_with_parent(new_dir,
-								old_inode) ||
-		!f2fs_is_child_context_consistent_with_parent(old_dir,
-								new_inode)))
+			(old_dir != new_dir) &&
+			(!fscrypt_has_permitted_context(new_dir, old_inode) ||
+			 !fscrypt_has_permitted_context(old_dir, new_inode)))
 		return -EPERM;
 
 	old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
@@ -970,16 +967,15 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
 	return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
 static const char *f2fs_encrypted_get_link(struct dentry *dentry,
 					   struct inode *inode,
 					   struct delayed_call *done)
 {
 	struct page *cpage = NULL;
 	char *caddr, *paddr = NULL;
-	struct f2fs_str cstr = FSTR_INIT(NULL, 0);
-	struct f2fs_str pstr = FSTR_INIT(NULL, 0);
-	struct f2fs_encrypted_symlink_data *sd;
+	struct fscrypt_str cstr = FSTR_INIT(NULL, 0);
+	struct fscrypt_str pstr = FSTR_INIT(NULL, 0);
+	struct fscrypt_symlink_data *sd;
 	loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
 	u32 max_size = inode->i_sb->s_blocksize;
 	int res;
@@ -987,7 +983,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
 	if (!dentry)
 		return ERR_PTR(-ECHILD);
 
-	res = f2fs_get_encryption_info(inode);
+	res = fscrypt_get_encryption_info(inode);
 	if (res)
 		return ERR_PTR(res);
 
@@ -998,7 +994,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
 	caddr[size] = 0;
 
 	/* Symlink is encrypted */
-	sd = (struct f2fs_encrypted_symlink_data *)caddr;
+	sd = (struct fscrypt_symlink_data *)caddr;
 	cstr.name = sd->encrypted_path;
 	cstr.len = le16_to_cpu(sd->len);
 
@@ -1014,17 +1010,16 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
 		goto errout;
 	}
 
-	if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) >
-								max_size) {
+	if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) {
 		/* Symlink data on the disk is corrupted */
 		res = -EIO;
 		goto errout;
 	}
-	res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr);
+	res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr);
 	if (res)
 		goto errout;
 
-	res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr);
+	res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
 	if (res < 0)
 		goto errout;
 
@@ -1037,7 +1032,7 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry,
 	set_delayed_call(done, kfree_link, paddr);
 	return paddr;
 errout:
-	f2fs_fname_crypto_free_buffer(&pstr);
+	fscrypt_fname_free_buffer(&pstr);
 	page_cache_release(cpage);
 	return ERR_PTR(res);
 }
@@ -1054,7 +1049,6 @@ const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 };
-#endif
 
 const struct inode_operations f2fs_dir_inode_operations = {
 	.create		= f2fs_create,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 579372d9291f..7b62016e66cd 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -470,10 +470,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
 
 	/* Will be used by directory only */
 	fi->i_dir_level = F2FS_SB(sb)->dir_level;
-
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-	fi->i_crypt_info = NULL;
-#endif
 	return &fi->vfs_inode;
 }
 
@@ -507,11 +503,7 @@ static int f2fs_drop_inode(struct inode *inode)
 
 			sb_end_intwrite(inode->i_sb);
 
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-			if (F2FS_I(inode)->i_crypt_info)
-				f2fs_free_encryption_info(inode,
-					F2FS_I(inode)->i_crypt_info);
-#endif
+			fscrypt_put_encryption_info(inode, NULL);
 			spin_lock(&inode->i_lock);
 			atomic_dec(&inode->i_count);
 		}
@@ -891,6 +883,41 @@ static struct super_operations f2fs_sops = {
 	.remount_fs	= f2fs_remount,
 };
 
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
+static int f2fs_get_context(struct inode *inode, void *ctx, size_t len)
+{
+	return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+				ctx, len, NULL);
+}
+
+static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len,
+							void *fs_data)
+{
+	return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION,
+				F2FS_XATTR_NAME_ENCRYPTION_CONTEXT,
+				ctx, len, fs_data, XATTR_CREATE);
+}
+
+static unsigned f2fs_max_namelen(struct inode *inode)
+{
+	return S_ISLNK(inode->i_mode) ?
+			inode->i_sb->s_blocksize : F2FS_NAME_LEN;
+}
+
+static struct fscrypt_operations f2fs_cryptops = {
+	.get_context	= f2fs_get_context,
+	.set_context	= f2fs_set_context,
+	.is_encrypted	= f2fs_encrypted_inode,
+	.empty_dir	= f2fs_empty_dir,
+	.max_namelen	= f2fs_max_namelen,
+};
+#else
+static struct fscrypt_operations f2fs_cryptops = {
+	.is_encrypted	= f2fs_encrypted_inode,
+};
+#endif
+
 static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
 		u64 ino, u32 generation)
 {
@@ -1314,6 +1341,7 @@ try_onemore:
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 
 	sb->s_op = &f2fs_sops;
+	sb->s_cop = &f2fs_cryptops;
 	sb->s_xattr = f2fs_xattr_handlers;
 	sb->s_export_op = &f2fs_export_ops;
 	sb->s_magic = F2FS_SUPER_MAGIC;
@@ -1619,13 +1647,9 @@ static int __init init_f2fs_fs(void)
 		err = -ENOMEM;
 		goto free_extent_cache;
 	}
-	err = f2fs_init_crypto();
-	if (err)
-		goto free_kset;
-
 	err = register_shrinker(&f2fs_shrinker_info);
 	if (err)
-		goto free_crypto;
+		goto free_kset;
 
 	err = register_filesystem(&f2fs_fs_type);
 	if (err)
@@ -1640,8 +1664,6 @@ free_filesystem:
 	unregister_filesystem(&f2fs_fs_type);
 free_shrinker:
 	unregister_shrinker(&f2fs_shrinker_info);
-free_crypto:
-	f2fs_exit_crypto();
 free_kset:
 	kset_unregister(f2fs_kset);
 free_extent_cache:
@@ -1664,7 +1686,6 @@ static void __exit exit_f2fs_fs(void)
 	f2fs_destroy_root_stats();
 	unregister_shrinker(&f2fs_shrinker_info);
 	unregister_filesystem(&f2fs_fs_type);
-	f2fs_exit_crypto();
 	destroy_extent_cache();
 	destroy_checkpoint_caches();
 	destroy_segment_manager_caches();
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 7781ce110503..c7bdfc54cec5 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -228,6 +228,8 @@ struct dentry_operations {
 #define DCACHE_FALLTHRU			0x01000000 /* Fall through to lower layer */
 #define DCACHE_OP_SELECT_INODE		0x02000000 /* Unioned entry: dcache op selects inode */
 
+#define DCACHE_ENCRYPTED_WITH_KEY	0x04000000 /* dir is encrypted with a valid key */
+
 extern seqlock_t rename_lock;
 
 /*
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ae681002100a..28fc12189997 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -53,6 +53,8 @@ struct swap_info_struct;
 struct seq_file;
 struct workqueue_struct;
 struct iov_iter;
+struct fscrypt_info;
+struct fscrypt_operations;
 
 extern void __init inode_init(void);
 extern void __init inode_init_early(void);
@@ -678,6 +680,10 @@ struct inode {
 	struct hlist_head	i_fsnotify_marks;
 #endif
 
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+	struct fscrypt_info	*i_crypt_info;
+#endif
+
 	void			*i_private; /* fs or device private pointer */
 };
 
@@ -1323,6 +1329,8 @@ struct super_block {
 #endif
 	const struct xattr_handler **s_xattr;
 
+	const struct fscrypt_operations	*s_cop;
+
 	struct hlist_bl_head	s_anon;		/* anonymous dentries for (nfs) exporting */
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
 	struct block_device	*s_bdev;
diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h
new file mode 100644
index 000000000000..895cdac4fcdd
--- /dev/null
+++ b/include/linux/fscrypto.h
@@ -0,0 +1,433 @@
+/*
+ * General per-file encryption definition
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+
+#ifndef _LINUX_FSCRYPTO_H
+#define _LINUX_FSCRYPTO_H
+
+#include <linux/key.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/bio.h>
+#include <linux/dcache.h>
+#include <uapi/linux/fs.h>
+
+#define FS_KEY_DERIVATION_NONCE_SIZE		16
+#define FS_ENCRYPTION_CONTEXT_FORMAT_V1		1
+
+#define FS_POLICY_FLAGS_PAD_4		0x00
+#define FS_POLICY_FLAGS_PAD_8		0x01
+#define FS_POLICY_FLAGS_PAD_16		0x02
+#define FS_POLICY_FLAGS_PAD_32		0x03
+#define FS_POLICY_FLAGS_PAD_MASK	0x03
+#define FS_POLICY_FLAGS_VALID		0x03
+
+/* Encryption algorithms */
+#define FS_ENCRYPTION_MODE_INVALID		0
+#define FS_ENCRYPTION_MODE_AES_256_XTS		1
+#define FS_ENCRYPTION_MODE_AES_256_GCM		2
+#define FS_ENCRYPTION_MODE_AES_256_CBC		3
+#define FS_ENCRYPTION_MODE_AES_256_CTS		4
+
+/**
+ * Encryption context for inode
+ *
+ * Protector format:
+ *  1 byte: Protector format (1 = this version)
+ *  1 byte: File contents encryption mode
+ *  1 byte: File names encryption mode
+ *  1 byte: Flags
+ *  8 bytes: Master Key descriptor
+ *  16 bytes: Encryption Key derivation nonce
+ */
+struct fscrypt_context {
+	u8 format;
+	u8 contents_encryption_mode;
+	u8 filenames_encryption_mode;
+	u8 flags;
+	u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+	u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE];
+} __packed;
+
+/* Encryption parameters */
+#define FS_XTS_TWEAK_SIZE		16
+#define FS_AES_128_ECB_KEY_SIZE		16
+#define FS_AES_256_GCM_KEY_SIZE		32
+#define FS_AES_256_CBC_KEY_SIZE		32
+#define FS_AES_256_CTS_KEY_SIZE		32
+#define FS_AES_256_XTS_KEY_SIZE		64
+#define FS_MAX_KEY_SIZE			64
+
+#define FS_KEY_DESC_PREFIX		"fscrypt:"
+#define FS_KEY_DESC_PREFIX_SIZE		8
+
+/* This is passed in from userspace into the kernel keyring */
+struct fscrypt_key {
+	u32 mode;
+	u8 raw[FS_MAX_KEY_SIZE];
+	u32 size;
+} __packed;
+
+struct fscrypt_info {
+	u8 ci_data_mode;
+	u8 ci_filename_mode;
+	u8 ci_flags;
+	struct crypto_ablkcipher *ci_ctfm;
+	struct key *ci_keyring_key;
+	u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE];
+};
+
+#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL		0x00000001
+#define FS_WRITE_PATH_FL			0x00000002
+
+struct fscrypt_ctx {
+	union {
+		struct {
+			struct page *bounce_page;	/* Ciphertext page */
+			struct page *control_page;	/* Original page  */
+		} w;
+		struct {
+			struct bio *bio;
+			struct work_struct work;
+		} r;
+		struct list_head free_list;	/* Free list */
+	};
+	u8 flags;				/* Flags */
+	u8 mode;				/* Encryption mode for tfm */
+};
+
+struct fscrypt_completion_result {
+	struct completion completion;
+	int res;
+};
+
+#define DECLARE_FS_COMPLETION_RESULT(ecr) \
+	struct fscrypt_completion_result ecr = { \
+		COMPLETION_INITIALIZER((ecr).completion), 0 }
+
+static inline int fscrypt_key_size(int mode)
+{
+	switch (mode) {
+	case FS_ENCRYPTION_MODE_AES_256_XTS:
+		return FS_AES_256_XTS_KEY_SIZE;
+	case FS_ENCRYPTION_MODE_AES_256_GCM:
+		return FS_AES_256_GCM_KEY_SIZE;
+	case FS_ENCRYPTION_MODE_AES_256_CBC:
+		return FS_AES_256_CBC_KEY_SIZE;
+	case FS_ENCRYPTION_MODE_AES_256_CTS:
+		return FS_AES_256_CTS_KEY_SIZE;
+	default:
+		BUG();
+	}
+	return 0;
+}
+
+#define FS_FNAME_NUM_SCATTER_ENTRIES	4
+#define FS_CRYPTO_BLOCK_SIZE		16
+#define FS_FNAME_CRYPTO_DIGEST_SIZE	32
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct fscrypt_symlink_data {
+	__le16 len;
+	char encrypted_path[1];
+} __packed;
+
+/**
+ * This function is used to calculate the disk space required to
+ * store a filename of length l in encrypted symlink format.
+ */
+static inline u32 fscrypt_symlink_data_len(u32 l)
+{
+	if (l < FS_CRYPTO_BLOCK_SIZE)
+		l = FS_CRYPTO_BLOCK_SIZE;
+	return (l + sizeof(struct fscrypt_symlink_data) - 1);
+}
+
+struct fscrypt_str {
+	unsigned char *name;
+	u32 len;
+};
+
+struct fscrypt_name {
+	const struct qstr *usr_fname;
+	struct fscrypt_str disk_name;
+	u32 hash;
+	u32 minor_hash;
+	struct fscrypt_str crypto_buf;
+};
+
+#define FSTR_INIT(n, l)		{ .name = n, .len = l }
+#define FSTR_TO_QSTR(f)		QSTR_INIT((f)->name, (f)->len)
+#define fname_name(p)		((p)->disk_name.name)
+#define fname_len(p)		((p)->disk_name.len)
+
+/*
+ * crypto opertions for filesystems
+ */
+struct fscrypt_operations {
+	int (*get_context)(struct inode *, void *, size_t);
+	int (*prepare_context)(struct inode *);
+	int (*set_context)(struct inode *, const void *, size_t, void *);
+	int (*dummy_context)(struct inode *);
+	bool (*is_encrypted)(struct inode *);
+	bool (*empty_dir)(struct inode *);
+	unsigned (*max_namelen)(struct inode *);
+};
+
+static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
+{
+	if (inode->i_sb->s_cop->dummy_context &&
+				inode->i_sb->s_cop->dummy_context(inode))
+		return true;
+	return false;
+}
+
+static inline bool fscrypt_valid_contents_enc_mode(u32 mode)
+{
+	return (mode == FS_ENCRYPTION_MODE_AES_256_XTS);
+}
+
+static inline bool fscrypt_valid_filenames_enc_mode(u32 mode)
+{
+	return (mode == FS_ENCRYPTION_MODE_AES_256_CTS);
+}
+
+static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size)
+{
+	if (size == fscrypt_key_size(mode))
+		return size;
+	return 0;
+}
+
+static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
+{
+	if (str->len == 1 && str->name[0] == '.')
+		return true;
+
+	if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
+		return true;
+
+	return false;
+}
+
+static inline struct page *fscrypt_control_page(struct page *page)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+	return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
+#else
+	WARN_ON_ONCE(1);
+	return ERR_PTR(-EINVAL);
+#endif
+}
+
+static inline int fscrypt_has_encryption_key(struct inode *inode)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+	return (inode->i_crypt_info != NULL);
+#else
+	return 0;
+#endif
+}
+
+static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+	spin_lock(&dentry->d_lock);
+	dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY;
+	spin_unlock(&dentry->d_lock);
+#endif
+}
+
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+extern const struct dentry_operations fscrypt_d_ops;
+#endif
+
+static inline void fscrypt_set_d_op(struct dentry *dentry)
+{
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+	d_set_d_op(dentry, &fscrypt_d_ops);
+#endif
+}
+
+#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
+/* crypto.c */
+extern struct kmem_cache *fscrypt_info_cachep;
+int fscrypt_initialize(void);
+
+extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *);
+extern void fscrypt_release_ctx(struct fscrypt_ctx *);
+extern struct page *fscrypt_encrypt_page(struct inode *, struct page *);
+extern int fscrypt_decrypt_page(struct page *);
+extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *);
+extern void fscrypt_pullback_bio_page(struct page **, bool);
+extern void fscrypt_restore_control_page(struct page *);
+extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t,
+						unsigned int);
+/* policy.c */
+extern int fscrypt_process_policy(struct inode *,
+					const struct fscrypt_policy *);
+extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *);
+extern int fscrypt_has_permitted_context(struct inode *, struct inode *);
+extern int fscrypt_inherit_context(struct inode *, struct inode *,
+					void *, bool);
+/* keyinfo.c */
+extern int get_crypt_info(struct inode *);
+extern int fscrypt_get_encryption_info(struct inode *);
+extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *);
+
+/* fname.c */
+extern int fscrypt_setup_filename(struct inode *, const struct qstr *,
+				int lookup, struct fscrypt_name *);
+extern void fscrypt_free_filename(struct fscrypt_name *);
+extern u32 fscrypt_fname_encrypted_size(struct inode *, u32);
+extern int fscrypt_fname_alloc_buffer(struct inode *, u32,
+				struct fscrypt_str *);
+extern void fscrypt_fname_free_buffer(struct fscrypt_str *);
+extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32,
+			const struct fscrypt_str *, struct fscrypt_str *);
+extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *,
+			struct fscrypt_str *);
+#endif
+
+/* crypto.c */
+static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c)
+{
+	return;
+}
+
+static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i,
+						struct page *p)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline int fscrypt_notsupp_decrypt_page(struct page *p)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_notsupp_decrypt_bio_pages(struct fscrypt_ctx *c,
+						struct bio *b)
+{
+	return;
+}
+
+static inline void fscrypt_notsupp_pullback_bio_page(struct page **p, bool b)
+{
+	return;
+}
+
+static inline void fscrypt_notsupp_restore_control_page(struct page *p)
+{
+	return;
+}
+
+static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p,
+					sector_t s, unsigned int f)
+{
+	return -EOPNOTSUPP;
+}
+
+/* policy.c */
+static inline int fscrypt_notsupp_process_policy(struct inode *i,
+				const struct fscrypt_policy *p)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_notsupp_get_policy(struct inode *i,
+				struct fscrypt_policy *p)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_notsupp_has_permitted_context(struct inode *p,
+				struct inode *i)
+{
+	return 0;
+}
+
+static inline int fscrypt_notsupp_inherit_context(struct inode *p,
+				struct inode *i, void *v, bool b)
+{
+	return -EOPNOTSUPP;
+}
+
+/* keyinfo.c */
+static inline int fscrypt_notsupp_get_encryption_info(struct inode *i)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_notsupp_put_encryption_info(struct inode *i,
+					struct fscrypt_info *f)
+{
+	return;
+}
+
+ /* fname.c */
+static inline int fscrypt_notsupp_setup_filename(struct inode *dir,
+			const struct qstr *iname,
+			int lookup, struct fscrypt_name *fname)
+{
+	if (dir->i_sb->s_cop->is_encrypted(dir))
+		return -EOPNOTSUPP;
+
+	memset(fname, 0, sizeof(struct fscrypt_name));
+	fname->usr_fname = iname;
+	fname->disk_name.name = (unsigned char *)iname->name;
+	fname->disk_name.len = iname->len;
+	return 0;
+}
+
+static inline void fscrypt_notsupp_free_filename(struct fscrypt_name *fname)
+{
+	return;
+}
+
+static inline u32 fscrypt_notsupp_fname_encrypted_size(struct inode *i, u32 s)
+{
+	/* never happens */
+	WARN_ON(1);
+	return 0;
+}
+
+static inline int fscrypt_notsupp_fname_alloc_buffer(struct inode *inode,
+				u32 ilen, struct fscrypt_str *crypto_str)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline void fscrypt_notsupp_fname_free_buffer(struct fscrypt_str *c)
+{
+	return;
+}
+
+static inline int fscrypt_notsupp_fname_disk_to_usr(struct inode *inode,
+			u32 hash, u32 minor_hash,
+			const struct fscrypt_str *iname,
+			struct fscrypt_str *oname)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int fscrypt_notsupp_fname_usr_to_disk(struct inode *inode,
+			const struct qstr *iname,
+			struct fscrypt_str *oname)
+{
+	return -EOPNOTSUPP;
+}
+#endif	/* _LINUX_FSCRYPTO_H */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 149bec83a907..cec100b6dfa9 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -246,6 +246,24 @@ struct fsxattr {
 #define FS_IOC_FSGETXATTR		_IOR ('X', 31, struct fsxattr)
 #define FS_IOC_FSSETXATTR		_IOW ('X', 32, struct fsxattr)
 
+/*
+ * File system encryption support
+ */
+/* Policy provided via an ioctl on the topmost directory */
+#define FS_KEY_DESCRIPTOR_SIZE	8
+
+struct fscrypt_policy {
+	__u8 version;
+	__u8 contents_encryption_mode;
+	__u8 filenames_encryption_mode;
+	__u8 flags;
+	__u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE];
+} __packed;
+
+#define FS_IOC_SET_ENCRYPTION_POLICY	_IOR('f', 19, struct fscrypt_policy)
+#define FS_IOC_GET_ENCRYPTION_PWSALT	_IOW('f', 20, __u8[16])
+#define FS_IOC_GET_ENCRYPTION_POLICY	_IOW('f', 21, struct fscrypt_policy)
+
 /*
  * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
  *
-- 
cgit v1.2.3


From 93e68cd6115f67d8363c94dae8206af36f6d3b00 Mon Sep 17 00:00:00 2001
From: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Date: Wed, 16 Mar 2016 09:12:46 +0000
Subject: net: fix a comment typo

Fix a comment typo.

Signed-off-by: Zhang Shengju <zhangshengju@cmss.chinamobile.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if.h b/include/uapi/linux/if.h
index 9cf2394f0bcf..f80277569f24 100644
--- a/include/uapi/linux/if.h
+++ b/include/uapi/linux/if.h
@@ -37,7 +37,7 @@
  * are shared for all types of net_devices. The sysfs entries are available
  * via /sys/class/net/<dev>/flags. Flags which can be toggled through sysfs
  * are annotated below, note that only a few flags can be toggled and some
- * other flags are always always preserved from the original net_device flags
+ * other flags are always preserved from the original net_device flags
  * even if you try to set them via sysfs. Flags which are always preserved
  * are kept under the flag grouping @IFF_VOLATILE. Flags which are volatile
  * are annotated below as such.
-- 
cgit v1.2.3


From c70ce028e834f8e51306217dbdbd441d851c64d3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 21 Mar 2016 09:55:10 -0700
Subject: net/rtnetlink: add IFLA_GSO_MAX_SEGS and IFLA_GSO_MAX_SIZE attributes

It can be useful to report dev->gso_max_segs and dev->gso_max_size
so that "ip -d link" can display them to help debugging.

For the moment, these attributes are read-only.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Petri Gynther <pgynther@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/if_link.h | 2 ++
 net/core/rtnetlink.c         | 4 ++++
 2 files changed, 6 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8e3f88fa5b59..76d315ca597d 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -153,6 +153,8 @@ enum {
 	IFLA_LINK_NETNSID,
 	IFLA_PHYS_PORT_NAME,
 	IFLA_PROTO_DOWN,
+	IFLA_GSO_MAX_SEGS,
+	IFLA_GSO_MAX_SIZE,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d2d9e5ebf58e..a69cd0c097b5 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -895,6 +895,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(4) /* IFLA_PROMISCUITY */
 	       + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
 	       + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+	       + nla_total_size(4) /* IFLA_MAX_GSO_SEGS */
+	       + nla_total_size(4) /* IFLA_MAX_GSO_SIZE */
 	       + nla_total_size(1) /* IFLA_OPERSTATE */
 	       + nla_total_size(1) /* IFLA_LINKMODE */
 	       + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
@@ -1223,6 +1225,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	    nla_put_u32(skb, IFLA_GROUP, dev->group) ||
 	    nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
 	    nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
+	    nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
+	    nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
 #ifdef CONFIG_RPS
 	    nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
 #endif
-- 
cgit v1.2.3


From cc8e27cc97318471b7e707932d5b93b0d5f70830 Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Fri, 11 Mar 2016 22:58:34 +0200
Subject: net/core: Add support for configuring VF GUIDs

Add two new NLAs to support configuration of Infiniband node or port
GUIDs. New applications can choose to use this interface to configure
GUIDs with iproute2 with commands such as:

ip link set dev ib0 vf 0 node_guid 00:02:c9:03:00:21:6e:70
ip link set dev ib0 vf 0 port_guid 00:02:c9:03:00:21:6e:78

A new ndo, ndo_sef_vf_guid is introduced to notify the net device of the
request to change the GUID.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
---
 include/linux/netdevice.h    |  3 +++
 include/uapi/linux/if_link.h |  7 +++++++
 net/core/rtnetlink.c         | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 46 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 5440b7b705eb..7b4ae218b90b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1147,6 +1147,9 @@ struct net_device_ops {
 						   struct nlattr *port[]);
 	int			(*ndo_get_vf_port)(struct net_device *dev,
 						   int vf, struct sk_buff *skb);
+	int			(*ndo_set_vf_guid)(struct net_device *dev,
+						   int vf, u64 guid,
+						   int guid_type);
 	int			(*ndo_set_vf_rss_query_en)(
 						   struct net_device *dev,
 						   int vf, bool setting);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index a30b78090594..1d01e8a4e5dd 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -556,6 +556,8 @@ enum {
 				 */
 	IFLA_VF_STATS,		/* network device statistics */
 	IFLA_VF_TRUST,		/* Trust VF */
+	IFLA_VF_IB_NODE_GUID,	/* VF Infiniband node GUID */
+	IFLA_VF_IB_PORT_GUID,	/* VF Infiniband port GUID */
 	__IFLA_VF_MAX,
 };
 
@@ -588,6 +590,11 @@ struct ifla_vf_spoofchk {
 	__u32 setting;
 };
 
+struct ifla_vf_guid {
+	__u32 vf;
+	__u64 guid;
+};
+
 enum {
 	IFLA_VF_LINK_STATE_AUTO,	/* link state of the uplink */
 	IFLA_VF_LINK_STATE_ENABLE,	/* link always up */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d735e854f916..4b6f3db9f8af 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1387,6 +1387,8 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
 	[IFLA_VF_RSS_QUERY_EN]	= { .len = sizeof(struct ifla_vf_rss_query_en) },
 	[IFLA_VF_STATS]		= { .type = NLA_NESTED },
 	[IFLA_VF_TRUST]		= { .len = sizeof(struct ifla_vf_trust) },
+	[IFLA_VF_IB_NODE_GUID]	= { .len = sizeof(struct ifla_vf_guid) },
+	[IFLA_VF_IB_PORT_GUID]	= { .len = sizeof(struct ifla_vf_guid) },
 };
 
 static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = {
@@ -1534,6 +1536,22 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
 	return 0;
 }
 
+static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt,
+				  int guid_type)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type);
+}
+
+static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type)
+{
+	if (dev->type != ARPHRD_INFINIBAND)
+		return -EOPNOTSUPP;
+
+	return handle_infiniband_guid(dev, ivt, guid_type);
+}
+
 static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
 {
 	const struct net_device_ops *ops = dev->netdev_ops;
@@ -1636,6 +1654,24 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
 			return err;
 	}
 
+	if (tb[IFLA_VF_IB_NODE_GUID]) {
+		struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);
+
+		if (!ops->ndo_set_vf_guid)
+			return -EOPNOTSUPP;
+
+		return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
+	}
+
+	if (tb[IFLA_VF_IB_PORT_GUID]) {
+		struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);
+
+		if (!ops->ndo_set_vf_guid)
+			return -EOPNOTSUPP;
+
+		return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID);
+	}
+
 	return err;
 }
 
-- 
cgit v1.2.3


From 5f2d472450c94402627f54b603d271076f3f97c2 Mon Sep 17 00:00:00 2001
From: David Decotigny <decot@googlers.com>
Date: Mon, 21 Mar 2016 10:15:34 -0700
Subject: ethtool: minor doc update

Updates: commit 793cf87de9d1 ("ethtool: Set cmd field in
         ETHTOOL_GLINKSETTINGS response to wrong nwords")

Signed-off-by: David Decotigny <decot@googlers.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/ethtool.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index 2835b07416b7..9222db8ccccc 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -1648,9 +1648,9 @@ enum ethtool_reset_flags {
  *	%ETHTOOL_GLINKSETTINGS: on entry, number of words passed by user
  *	(>= 0); on return, if handshake in progress, negative if
  *	request size unsupported by kernel: absolute value indicates
- *	kernel recommended size and cmd field is 0, as well as all the
- *	other fields; otherwise (handshake completed), strictly
- *	positive to indicate size used by kernel and cmd field is
+ *	kernel expected size and all the other fields but cmd
+ *	are 0; otherwise (handshake completed), strictly positive
+ *	to indicate size used by kernel and cmd field stays
  *	%ETHTOOL_GLINKSETTINGS, all other fields populated by driver. For
  *	%ETHTOOL_SLINKSETTINGS: must be valid on entry, ie. a positive
  *	value returned previously by %ETHTOOL_GLINKSETTINGS, otherwise
-- 
cgit v1.2.3


From e8de370188d098bb49483c287b44925957c3c9b6 Mon Sep 17 00:00:00 2001
From: Alexandre Bounine <alexandre.bounine@idt.com>
Date: Tue, 22 Mar 2016 14:27:08 -0700
Subject: rapidio: add mport char device driver

Add mport character device driver to provide user space interface to
basic RapidIO subsystem operations.

See included Documentation/rapidio/mport_cdev.txt for more details.

[akpm@linux-foundation.org: fix printk warning on i386]
[dan.carpenter@oracle.com: mport_cdev: fix some error codes]
Signed-off-by: Alexandre Bounine <alexandre.bounine@idt.com>
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Tested-by: Barry Wood <barry.wood@idt.com>
Cc: Matt Porter <mporter@kernel.crashing.org>
Cc: Aurelien Jacquiot <a-jacquiot@ti.com>
Cc: Andre van Herk <andre.van.herk@prodrive-technologies.com>
Cc: Barry Wood <barry.wood@idt.com>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/rapidio/mport_cdev.txt     |  104 ++
 drivers/rapidio/Kconfig                  |    8 +
 drivers/rapidio/devices/Makefile         |    1 +
 drivers/rapidio/devices/rio_mport_cdev.c | 2720 ++++++++++++++++++++++++++++++
 include/linux/rio_mport_cdev.h           |  271 +++
 include/uapi/linux/Kbuild                |    1 +
 6 files changed, 3105 insertions(+)
 create mode 100644 Documentation/rapidio/mport_cdev.txt
 create mode 100644 drivers/rapidio/devices/rio_mport_cdev.c
 create mode 100644 include/linux/rio_mport_cdev.h

(limited to 'include/uapi/linux')

diff --git a/Documentation/rapidio/mport_cdev.txt b/Documentation/rapidio/mport_cdev.txt
new file mode 100644
index 000000000000..20c120d4b3b8
--- /dev/null
+++ b/Documentation/rapidio/mport_cdev.txt
@@ -0,0 +1,104 @@
+RapidIO subsystem mport character device driver (rio_mport_cdev.c)
+==================================================================
+
+Version History:
+----------------
+  1.0.0 - Initial driver release.
+
+==================================================================
+
+I. Overview
+
+This device driver is the result of collaboration within the RapidIO.org
+Software Task Group (STG) between Texas Instruments, Freescale,
+Prodrive Technologies, Nokia Networks, BAE and IDT.  Additional input was
+received from other members of RapidIO.org. The objective was to create a
+character mode driver interface which exposes the capabilities of RapidIO
+devices directly to applications, in a manner that allows the numerous and
+varied RapidIO implementations to interoperate.
+
+This driver (MPORT_CDEV) provides access to basic RapidIO subsystem operations
+for user-space applications. Most of RapidIO operations are supported through
+'ioctl' system calls.
+
+When loaded this device driver creates filesystem nodes named rio_mportX in /dev
+directory for each registered RapidIO mport device. 'X' in the node name matches
+to unique port ID assigned to each local mport device.
+
+Using available set of ioctl commands user-space applications can perform
+following RapidIO bus and subsystem operations:
+
+- Reads and writes from/to configuration registers of mport devices
+    (RIO_MPORT_MAINT_READ_LOCAL/RIO_MPORT_MAINT_WRITE_LOCAL)
+- Reads and writes from/to configuration registers of remote RapidIO devices.
+  This operations are defined as RapidIO Maintenance reads/writes in RIO spec.
+    (RIO_MPORT_MAINT_READ_REMOTE/RIO_MPORT_MAINT_WRITE_REMOTE)
+- Set RapidIO Destination ID for mport devices (RIO_MPORT_MAINT_HDID_SET)
+- Set RapidIO Component Tag for mport devices (RIO_MPORT_MAINT_COMPTAG_SET)
+- Query logical index of mport devices (RIO_MPORT_MAINT_PORT_IDX_GET)
+- Query capabilities and RapidIO link configuration of mport devices
+    (RIO_MPORT_GET_PROPERTIES)
+- Enable/Disable reporting of RapidIO doorbell events to user-space applications
+    (RIO_ENABLE_DOORBELL_RANGE/RIO_DISABLE_DOORBELL_RANGE)
+- Enable/Disable reporting of RIO port-write events to user-space applications
+    (RIO_ENABLE_PORTWRITE_RANGE/RIO_DISABLE_PORTWRITE_RANGE)
+- Query/Control type of events reported through this driver: doorbells,
+  port-writes or both (RIO_SET_EVENT_MASK/RIO_GET_EVENT_MASK)
+- Configure/Map mport's outbound requests window(s) for specific size,
+  RapidIO destination ID, hopcount and request type
+    (RIO_MAP_OUTBOUND/RIO_UNMAP_OUTBOUND)
+- Configure/Map mport's inbound requests window(s) for specific size,
+  RapidIO base address and local memory base address
+    (RIO_MAP_INBOUND/RIO_UNMAP_INBOUND)
+- Allocate/Free contiguous DMA coherent memory buffer for DMA data transfers
+  to/from remote RapidIO devices (RIO_ALLOC_DMA/RIO_FREE_DMA)
+- Initiate DMA data transfers to/from remote RapidIO devices (RIO_TRANSFER).
+  Supports blocking, asynchronous and posted (a.k.a 'fire-and-forget') data
+  transfer modes.
+- Check/Wait for completion of asynchronous DMA data transfer
+    (RIO_WAIT_FOR_ASYNC)
+- Manage device objects supported by RapidIO subsystem (RIO_DEV_ADD/RIO_DEV_DEL).
+  This allows implementation of various RapidIO fabric enumeration algorithms
+  as user-space applications while using remaining functionality provided by
+  kernel RapidIO subsystem.
+
+II. Hardware Compatibility
+
+This device driver uses standard interfaces defined by kernel RapidIO subsystem
+and therefore it can be used with any mport device driver registered by RapidIO
+subsystem with limitations set by available mport implementation.
+
+At this moment the most common limitation is availability of RapidIO-specific
+DMA engine framework for specific mport device. Users should verify available
+functionality of their platform when planning to use this driver:
+
+- IDT Tsi721 PCIe-to-RapidIO bridge device and its mport device driver are fully
+  compatible with this driver.
+- Freescale SoCs 'fsl_rio' mport driver does not have implementation for RapidIO
+  specific DMA engine support and therefore DMA data transfers mport_cdev driver
+  are not available.
+
+III. Module parameters
+
+- 'dbg_level' - This parameter allows to control amount of debug information
+        generated by this device driver. This parameter is formed by set of
+        This parameter can be changed bit masks that correspond to the specific
+        functional block.
+        For mask definitions see 'drivers/rapidio/devices/rio_mport_cdev.c'
+        This parameter can be changed dynamically.
+        Use CONFIG_RAPIDIO_DEBUG=y to enable debug output at the top level.
+
+IV. Known problems
+
+  None.
+
+V. User-space Applications and API
+
+API library and applications that use this device driver are available from
+RapidIO.org.
+
+VI. TODO List
+
+- Add support for sending/receiving "raw" RapidIO messaging packets.
+- Add memory mapped DMA data transfers as an option when RapidIO-specific DMA
+  is not available.
diff --git a/drivers/rapidio/Kconfig b/drivers/rapidio/Kconfig
index 3e3be57e9a1a..b5a10d3c92c7 100644
--- a/drivers/rapidio/Kconfig
+++ b/drivers/rapidio/Kconfig
@@ -67,6 +67,14 @@ config RAPIDIO_ENUM_BASIC
 
 endchoice
 
+config RAPIDIO_MPORT_CDEV
+	tristate "RapidIO /dev mport device driver"
+	depends on RAPIDIO
+	help
+	  This option includes generic RapidIO mport device driver which
+	  allows to user space applications to perform RapidIO-specific
+	  operations through selected RapidIO mport.
+
 menu "RapidIO Switch drivers"
 	depends on RAPIDIO
 
diff --git a/drivers/rapidio/devices/Makefile b/drivers/rapidio/devices/Makefile
index 9432c494cf57..927dbf89592b 100644
--- a/drivers/rapidio/devices/Makefile
+++ b/drivers/rapidio/devices/Makefile
@@ -5,3 +5,4 @@
 obj-$(CONFIG_RAPIDIO_TSI721)	+= tsi721_mport.o
 tsi721_mport-y			:= tsi721.o
 tsi721_mport-$(CONFIG_RAPIDIO_DMA_ENGINE) += tsi721_dma.o
+obj-$(CONFIG_RAPIDIO_MPORT_CDEV) += rio_mport_cdev.o
diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c
new file mode 100644
index 000000000000..9607bc826460
--- /dev/null
+++ b/drivers/rapidio/devices/rio_mport_cdev.c
@@ -0,0 +1,2720 @@
+/*
+ * RapidIO mport character device
+ *
+ * Copyright 2014-2015 Integrated Device Technology, Inc.
+ *    Alexandre Bounine <alexandre.bounine@idt.com>
+ * Copyright 2014-2015 Prodrive Technologies
+ *    Andre van Herk <andre.van.herk@prodrive-technologies.com>
+ *    Jerry Jacobs <jerry.jacobs@prodrive-technologies.com>
+ * Copyright (C) 2014 Texas Instruments Incorporated
+ *    Aurelien Jacquiot <a-jacquiot@ti.com>
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/cdev.h>
+#include <linux/ioctl.h>
+#include <linux/uaccess.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+#include <linux/err.h>
+#include <linux/net.h>
+#include <linux/poll.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+#include <linux/kfifo.h>
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mman.h>
+
+#include <linux/dma-mapping.h>
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+#include <linux/dmaengine.h>
+#endif
+
+#include <linux/rio.h>
+#include <linux/rio_ids.h>
+#include <linux/rio_drv.h>
+#include <linux/rio_mport_cdev.h>
+
+#include "../rio.h"
+
+#define DRV_NAME	"rio_mport"
+#define DRV_PREFIX	DRV_NAME ": "
+#define DEV_NAME	"rio_mport"
+#define DRV_VERSION     "1.0.0"
+
+/* Debug output filtering masks */
+enum {
+	DBG_NONE	= 0,
+	DBG_INIT	= BIT(0), /* driver init */
+	DBG_EXIT	= BIT(1), /* driver exit */
+	DBG_MPORT	= BIT(2), /* mport add/remove */
+	DBG_RDEV	= BIT(3), /* RapidIO device add/remove */
+	DBG_DMA		= BIT(4), /* DMA transfer messages */
+	DBG_MMAP	= BIT(5), /* mapping messages */
+	DBG_IBW		= BIT(6), /* inbound window */
+	DBG_EVENT	= BIT(7), /* event handling messages */
+	DBG_OBW		= BIT(8), /* outbound window messages */
+	DBG_DBELL	= BIT(9), /* doorbell messages */
+	DBG_ALL		= ~0,
+};
+
+#ifdef DEBUG
+#define rmcd_debug(level, fmt, arg...)		\
+	do {					\
+		if (DBG_##level & dbg_level)	\
+			pr_debug(DRV_PREFIX "%s: " fmt "\n", __func__, ##arg); \
+	} while (0)
+#else
+#define rmcd_debug(level, fmt, arg...) \
+		no_printk(KERN_DEBUG pr_fmt(DRV_PREFIX fmt "\n"), ##arg)
+#endif
+
+#define rmcd_warn(fmt, arg...) \
+	pr_warn(DRV_PREFIX "%s WARNING " fmt "\n", __func__, ##arg)
+
+#define rmcd_error(fmt, arg...) \
+	pr_err(DRV_PREFIX "%s ERROR " fmt "\n", __func__, ##arg)
+
+MODULE_AUTHOR("Jerry Jacobs <jerry.jacobs@prodrive-technologies.com>");
+MODULE_AUTHOR("Aurelien Jacquiot <a-jacquiot@ti.com>");
+MODULE_AUTHOR("Alexandre Bounine <alexandre.bounine@idt.com>");
+MODULE_AUTHOR("Andre van Herk <andre.van.herk@prodrive-technologies.com>");
+MODULE_DESCRIPTION("RapidIO mport character device driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static int dma_timeout = 3000; /* DMA transfer timeout in msec */
+module_param(dma_timeout, int, S_IRUGO);
+MODULE_PARM_DESC(dma_timeout, "DMA Transfer Timeout in msec (default: 3000)");
+
+#ifdef DEBUG
+static u32 dbg_level = DBG_NONE;
+module_param(dbg_level, uint, S_IWUSR | S_IWGRP | S_IRUGO);
+MODULE_PARM_DESC(dbg_level, "Debugging output level (default 0 = none)");
+#endif
+
+/*
+ * An internal DMA coherent buffer
+ */
+struct mport_dma_buf {
+	void		*ib_base;
+	dma_addr_t	ib_phys;
+	u32		ib_size;
+	u64		ib_rio_base;
+	bool		ib_map;
+	struct file	*filp;
+};
+
+/*
+ * Internal memory mapping structure
+ */
+enum rio_mport_map_dir {
+	MAP_INBOUND,
+	MAP_OUTBOUND,
+	MAP_DMA,
+};
+
+struct rio_mport_mapping {
+	struct list_head node;
+	struct mport_dev *md;
+	enum rio_mport_map_dir dir;
+	u32 rioid;
+	u64 rio_addr;
+	dma_addr_t phys_addr; /* for mmap */
+	void *virt_addr; /* kernel address, for dma_free_coherent */
+	u64 size;
+	struct kref ref; /* refcount of vmas sharing the mapping */
+	struct file *filp;
+};
+
+struct rio_mport_dma_map {
+	int valid;
+	uint64_t length;
+	void *vaddr;
+	dma_addr_t paddr;
+};
+
+#define MPORT_MAX_DMA_BUFS	16
+#define MPORT_EVENT_DEPTH	10
+
+/*
+ * mport_dev  driver-specific structure that represents mport device
+ * @active    mport device status flag
+ * @node      list node to maintain list of registered mports
+ * @cdev      character device
+ * @dev       associated device object
+ * @mport     associated subsystem's master port device object
+ * @buf_mutex lock for buffer handling
+ * @file_mutex - lock for open files list
+ * @file_list  - list of open files on given mport
+ * @properties properties of this mport
+ * @portwrites queue of inbound portwrites
+ * @pw_lock    lock for port write queue
+ * @mappings   queue for memory mappings
+ * @dma_chan   DMA channels associated with this device
+ * @dma_ref:
+ * @comp:
+ */
+struct mport_dev {
+	atomic_t		active;
+	struct list_head	node;
+	struct cdev		cdev;
+	struct device		dev;
+	struct rio_mport	*mport;
+	struct mutex		buf_mutex;
+	struct mutex		file_mutex;
+	struct list_head	file_list;
+	struct rio_mport_properties	properties;
+	struct list_head		doorbells;
+	spinlock_t			db_lock;
+	struct list_head		portwrites;
+	spinlock_t			pw_lock;
+	struct list_head	mappings;
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+	struct dma_chan *dma_chan;
+	struct kref	dma_ref;
+	struct completion comp;
+#endif
+};
+
+/*
+ * mport_cdev_priv - data structure specific to individual file object
+ *                   associated with an open device
+ * @md    master port character device object
+ * @async_queue - asynchronous notification queue
+ * @list - file objects tracking list
+ * @db_filters    inbound doorbell filters for this descriptor
+ * @pw_filters    portwrite filters for this descriptor
+ * @event_fifo    event fifo for this descriptor
+ * @event_rx_wait wait queue for this descriptor
+ * @fifo_lock     lock for event_fifo
+ * @event_mask    event mask for this descriptor
+ * @dmach DMA engine channel allocated for specific file object
+ */
+struct mport_cdev_priv {
+	struct mport_dev	*md;
+	struct fasync_struct	*async_queue;
+	struct list_head	list;
+	struct list_head	db_filters;
+	struct list_head        pw_filters;
+	struct kfifo            event_fifo;
+	wait_queue_head_t       event_rx_wait;
+	spinlock_t              fifo_lock;
+	unsigned int            event_mask; /* RIO_DOORBELL, RIO_PORTWRITE */
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+	struct dma_chan		*dmach;
+	struct list_head	async_list;
+	struct list_head	pend_list;
+	spinlock_t              req_lock;
+	struct mutex		dma_lock;
+	struct kref		dma_ref;
+	struct completion	comp;
+#endif
+};
+
+/*
+ * rio_mport_pw_filter - structure to describe a portwrite filter
+ * md_node   node in mport device's list
+ * priv_node node in private file object's list
+ * priv      reference to private data
+ * filter    actual portwrite filter
+ */
+struct rio_mport_pw_filter {
+	struct list_head md_node;
+	struct list_head priv_node;
+	struct mport_cdev_priv *priv;
+	struct rio_pw_filter filter;
+};
+
+/*
+ * rio_mport_db_filter - structure to describe a doorbell filter
+ * @data_node reference to device node
+ * @priv_node node in private data
+ * @priv      reference to private data
+ * @filter    actual doorbell filter
+ */
+struct rio_mport_db_filter {
+	struct list_head data_node;
+	struct list_head priv_node;
+	struct mport_cdev_priv *priv;
+	struct rio_doorbell_filter filter;
+};
+
+static LIST_HEAD(mport_devs);
+static DEFINE_MUTEX(mport_devs_lock);
+
+#if (0) /* used by commented out portion of poll function : FIXME */
+static DECLARE_WAIT_QUEUE_HEAD(mport_cdev_wait);
+#endif
+
+static struct class *dev_class;
+static dev_t dev_number;
+
+static struct workqueue_struct *dma_wq;
+
+static void mport_release_mapping(struct kref *ref);
+
+static int rio_mport_maint_rd(struct mport_cdev_priv *priv, void __user *arg,
+			      int local)
+{
+	struct rio_mport *mport = priv->md->mport;
+	struct rio_mport_maint_io maint_io;
+	u32 *buffer;
+	u32 offset;
+	size_t length;
+	int ret, i;
+
+	if (unlikely(copy_from_user(&maint_io, arg, sizeof(maint_io))))
+		return -EFAULT;
+
+	if ((maint_io.offset % 4) ||
+	    (maint_io.length == 0) || (maint_io.length % 4))
+		return -EINVAL;
+
+	buffer = vmalloc(maint_io.length);
+	if (buffer == NULL)
+		return -ENOMEM;
+	length = maint_io.length/sizeof(u32);
+	offset = maint_io.offset;
+
+	for (i = 0; i < length; i++) {
+		if (local)
+			ret = __rio_local_read_config_32(mport,
+				offset, &buffer[i]);
+		else
+			ret = rio_mport_read_config_32(mport, maint_io.rioid,
+				maint_io.hopcount, offset, &buffer[i]);
+		if (ret)
+			goto out;
+
+		offset += 4;
+	}
+
+	if (unlikely(copy_to_user(maint_io.buffer, buffer, maint_io.length)))
+		ret = -EFAULT;
+out:
+	vfree(buffer);
+	return ret;
+}
+
+static int rio_mport_maint_wr(struct mport_cdev_priv *priv, void __user *arg,
+			      int local)
+{
+	struct rio_mport *mport = priv->md->mport;
+	struct rio_mport_maint_io maint_io;
+	u32 *buffer;
+	u32 offset;
+	size_t length;
+	int ret = -EINVAL, i;
+
+	if (unlikely(copy_from_user(&maint_io, arg, sizeof(maint_io))))
+		return -EFAULT;
+
+	if ((maint_io.offset % 4) ||
+	    (maint_io.length == 0) || (maint_io.length % 4))
+		return -EINVAL;
+
+	buffer = vmalloc(maint_io.length);
+	if (buffer == NULL)
+		return -ENOMEM;
+	length = maint_io.length;
+
+	if (unlikely(copy_from_user(buffer, maint_io.buffer, length))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	offset = maint_io.offset;
+	length /= sizeof(u32);
+
+	for (i = 0; i < length; i++) {
+		if (local)
+			ret = __rio_local_write_config_32(mport,
+							  offset, buffer[i]);
+		else
+			ret = rio_mport_write_config_32(mport, maint_io.rioid,
+							maint_io.hopcount,
+							offset, buffer[i]);
+		if (ret)
+			goto out;
+
+		offset += 4;
+	}
+
+out:
+	vfree(buffer);
+	return ret;
+}
+
+
+/*
+ * Inbound/outbound memory mapping functions
+ */
+static int
+rio_mport_create_outbound_mapping(struct mport_dev *md, struct file *filp,
+				  u32 rioid, u64 raddr, u32 size,
+				  dma_addr_t *paddr)
+{
+	struct rio_mport *mport = md->mport;
+	struct rio_mport_mapping *map;
+	int ret;
+
+	rmcd_debug(OBW, "did=%d ra=0x%llx sz=0x%x", rioid, raddr, size);
+
+	map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL);
+	if (map == NULL)
+		return -ENOMEM;
+
+	ret = rio_map_outb_region(mport, rioid, raddr, size, 0, paddr);
+	if (ret < 0)
+		goto err_map_outb;
+
+	map->dir = MAP_OUTBOUND;
+	map->rioid = rioid;
+	map->rio_addr = raddr;
+	map->size = size;
+	map->phys_addr = *paddr;
+	map->filp = filp;
+	map->md = md;
+	kref_init(&map->ref);
+	list_add_tail(&map->node, &md->mappings);
+	return 0;
+err_map_outb:
+	kfree(map);
+	return ret;
+}
+
+static int
+rio_mport_get_outbound_mapping(struct mport_dev *md, struct file *filp,
+			       u32 rioid, u64 raddr, u32 size,
+			       dma_addr_t *paddr)
+{
+	struct rio_mport_mapping *map;
+	int err = -ENOMEM;
+
+	mutex_lock(&md->buf_mutex);
+	list_for_each_entry(map, &md->mappings, node) {
+		if (map->dir != MAP_OUTBOUND)
+			continue;
+		if (rioid == map->rioid &&
+		    raddr == map->rio_addr && size == map->size) {
+			*paddr = map->phys_addr;
+			err = 0;
+			break;
+		} else if (rioid == map->rioid &&
+			   raddr < (map->rio_addr + map->size - 1) &&
+			   (raddr + size) > map->rio_addr) {
+			err = -EBUSY;
+			break;
+		}
+	}
+
+	/* If not found, create new */
+	if (err == -ENOMEM)
+		err = rio_mport_create_outbound_mapping(md, filp, rioid, raddr,
+						size, paddr);
+	mutex_unlock(&md->buf_mutex);
+	return err;
+}
+
+static int rio_mport_obw_map(struct file *filp, void __user *arg)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	struct mport_dev *data = priv->md;
+	struct rio_mmap map;
+	dma_addr_t paddr;
+	int ret;
+
+	if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_mmap))))
+		return -EFAULT;
+
+	rmcd_debug(OBW, "did=%d ra=0x%llx sz=0x%llx",
+		   map.rioid, map.rio_addr, map.length);
+
+	ret = rio_mport_get_outbound_mapping(data, filp, map.rioid,
+					     map.rio_addr, map.length, &paddr);
+	if (ret < 0) {
+		rmcd_error("Failed to set OBW err= %d", ret);
+		return ret;
+	}
+
+	map.handle = paddr;
+
+	if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_mmap))))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * rio_mport_obw_free() - unmap an OutBound Window from RapidIO address space
+ *
+ * @priv: driver private data
+ * @arg:  buffer handle returned by allocation routine
+ */
+static int rio_mport_obw_free(struct file *filp, void __user *arg)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	struct mport_dev *md = priv->md;
+	u64 handle;
+	struct rio_mport_mapping *map, *_map;
+
+	if (!md->mport->ops->unmap_outb)
+		return -EPROTONOSUPPORT;
+
+	if (copy_from_user(&handle, arg, sizeof(u64)))
+		return -EFAULT;
+
+	rmcd_debug(OBW, "h=0x%llx", handle);
+
+	mutex_lock(&md->buf_mutex);
+	list_for_each_entry_safe(map, _map, &md->mappings, node) {
+		if (map->dir == MAP_OUTBOUND && map->phys_addr == handle) {
+			if (map->filp == filp) {
+				rmcd_debug(OBW, "kref_put h=0x%llx", handle);
+				map->filp = NULL;
+				kref_put(&map->ref, mport_release_mapping);
+			}
+			break;
+		}
+	}
+	mutex_unlock(&md->buf_mutex);
+
+	return 0;
+}
+
+/*
+ * maint_hdid_set() - Set the host Device ID
+ * @priv: driver private data
+ * @arg:	Device Id
+ */
+static int maint_hdid_set(struct mport_cdev_priv *priv, void __user *arg)
+{
+	struct mport_dev *md = priv->md;
+	uint16_t hdid;
+
+	if (copy_from_user(&hdid, arg, sizeof(uint16_t)))
+		return -EFAULT;
+
+	md->mport->host_deviceid = hdid;
+	md->properties.hdid = hdid;
+	rio_local_set_device_id(md->mport, hdid);
+
+	rmcd_debug(MPORT, "Set host device Id to %d", hdid);
+
+	return 0;
+}
+
+/*
+ * maint_comptag_set() - Set the host Component Tag
+ * @priv: driver private data
+ * @arg:	Component Tag
+ */
+static int maint_comptag_set(struct mport_cdev_priv *priv, void __user *arg)
+{
+	struct mport_dev *md = priv->md;
+	uint32_t comptag;
+
+	if (copy_from_user(&comptag, arg, sizeof(uint32_t)))
+		return -EFAULT;
+
+	rio_local_write_config_32(md->mport, RIO_COMPONENT_TAG_CSR, comptag);
+
+	rmcd_debug(MPORT, "Set host Component Tag to %d", comptag);
+
+	return 0;
+}
+
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+
+struct mport_dma_req {
+	struct list_head node;
+	struct file *filp;
+	struct mport_cdev_priv *priv;
+	enum rio_transfer_sync sync;
+	struct sg_table sgt;
+	struct page **page_list;
+	unsigned int nr_pages;
+	struct rio_mport_mapping *map;
+	struct dma_chan *dmach;
+	enum dma_data_direction dir;
+	dma_cookie_t cookie;
+	enum dma_status	status;
+	struct completion req_comp;
+};
+
+struct mport_faf_work {
+	struct work_struct work;
+	struct mport_dma_req *req;
+};
+
+static void mport_release_def_dma(struct kref *dma_ref)
+{
+	struct mport_dev *md =
+			container_of(dma_ref, struct mport_dev, dma_ref);
+
+	rmcd_debug(EXIT, "DMA_%d", md->dma_chan->chan_id);
+	rio_release_dma(md->dma_chan);
+	md->dma_chan = NULL;
+}
+
+static void mport_release_dma(struct kref *dma_ref)
+{
+	struct mport_cdev_priv *priv =
+			container_of(dma_ref, struct mport_cdev_priv, dma_ref);
+
+	rmcd_debug(EXIT, "DMA_%d", priv->dmach->chan_id);
+	complete(&priv->comp);
+}
+
+static void dma_req_free(struct mport_dma_req *req)
+{
+	struct mport_cdev_priv *priv = req->priv;
+	unsigned int i;
+
+	dma_unmap_sg(req->dmach->device->dev,
+		     req->sgt.sgl, req->sgt.nents, req->dir);
+	sg_free_table(&req->sgt);
+	if (req->page_list) {
+		for (i = 0; i < req->nr_pages; i++)
+			put_page(req->page_list[i]);
+		kfree(req->page_list);
+	}
+
+	if (req->map) {
+		mutex_lock(&req->map->md->buf_mutex);
+		kref_put(&req->map->ref, mport_release_mapping);
+		mutex_unlock(&req->map->md->buf_mutex);
+	}
+
+	kref_put(&priv->dma_ref, mport_release_dma);
+
+	kfree(req);
+}
+
+static void dma_xfer_callback(void *param)
+{
+	struct mport_dma_req *req = (struct mport_dma_req *)param;
+	struct mport_cdev_priv *priv = req->priv;
+
+	req->status = dma_async_is_tx_complete(priv->dmach, req->cookie,
+					       NULL, NULL);
+	complete(&req->req_comp);
+}
+
+static void dma_faf_cleanup(struct work_struct *_work)
+{
+	struct mport_faf_work *work = container_of(_work,
+						struct mport_faf_work, work);
+	struct mport_dma_req *req = work->req;
+
+	dma_req_free(req);
+	kfree(work);
+}
+
+static void dma_faf_callback(void *param)
+{
+	struct mport_dma_req *req = (struct mport_dma_req *)param;
+	struct mport_faf_work *work;
+
+	work = kmalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work)
+		return;
+
+	INIT_WORK(&work->work, dma_faf_cleanup);
+	work->req = req;
+	queue_work(dma_wq, &work->work);
+}
+
+/*
+ * prep_dma_xfer() - Configure and send request to DMAengine to prepare DMA
+ *                   transfer object.
+ * Returns pointer to DMA transaction descriptor allocated by DMA driver on
+ * success or ERR_PTR (and/or NULL) if failed. Caller must check returned
+ * non-NULL pointer using IS_ERR macro.
+ */
+static struct dma_async_tx_descriptor
+*prep_dma_xfer(struct dma_chan *chan, struct rio_transfer_io *transfer,
+	struct sg_table *sgt, int nents, enum dma_transfer_direction dir,
+	enum dma_ctrl_flags flags)
+{
+	struct rio_dma_data tx_data;
+
+	tx_data.sg = sgt->sgl;
+	tx_data.sg_len = nents;
+	tx_data.rio_addr_u = 0;
+	tx_data.rio_addr = transfer->rio_addr;
+	if (dir == DMA_MEM_TO_DEV) {
+		switch (transfer->method) {
+		case RIO_EXCHANGE_NWRITE:
+			tx_data.wr_type = RDW_ALL_NWRITE;
+			break;
+		case RIO_EXCHANGE_NWRITE_R_ALL:
+			tx_data.wr_type = RDW_ALL_NWRITE_R;
+			break;
+		case RIO_EXCHANGE_NWRITE_R:
+			tx_data.wr_type = RDW_LAST_NWRITE_R;
+			break;
+		case RIO_EXCHANGE_DEFAULT:
+			tx_data.wr_type = RDW_DEFAULT;
+			break;
+		default:
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	return rio_dma_prep_xfer(chan, transfer->rioid, &tx_data, dir, flags);
+}
+
+/* Request DMA channel associated with this mport device.
+ * Try to request DMA channel for every new process that opened given
+ * mport. If a new DMA channel is not available use default channel
+ * which is the first DMA channel opened on mport device.
+ */
+static int get_dma_channel(struct mport_cdev_priv *priv)
+{
+	mutex_lock(&priv->dma_lock);
+	if (!priv->dmach) {
+		priv->dmach = rio_request_mport_dma(priv->md->mport);
+		if (!priv->dmach) {
+			/* Use default DMA channel if available */
+			if (priv->md->dma_chan) {
+				priv->dmach = priv->md->dma_chan;
+				kref_get(&priv->md->dma_ref);
+			} else {
+				rmcd_error("Failed to get DMA channel");
+				mutex_unlock(&priv->dma_lock);
+				return -ENODEV;
+			}
+		} else if (!priv->md->dma_chan) {
+			/* Register default DMA channel if we do not have one */
+			priv->md->dma_chan = priv->dmach;
+			kref_init(&priv->md->dma_ref);
+			rmcd_debug(DMA, "Register DMA_chan %d as default",
+				   priv->dmach->chan_id);
+		}
+
+		kref_init(&priv->dma_ref);
+		init_completion(&priv->comp);
+	}
+
+	kref_get(&priv->dma_ref);
+	mutex_unlock(&priv->dma_lock);
+	return 0;
+}
+
+static void put_dma_channel(struct mport_cdev_priv *priv)
+{
+	kref_put(&priv->dma_ref, mport_release_dma);
+}
+
+/*
+ * DMA transfer functions
+ */
+static int do_dma_request(struct mport_dma_req *req,
+			  struct rio_transfer_io *xfer,
+			  enum rio_transfer_sync sync, int nents)
+{
+	struct mport_cdev_priv *priv;
+	struct sg_table *sgt;
+	struct dma_chan *chan;
+	struct dma_async_tx_descriptor *tx;
+	dma_cookie_t cookie;
+	unsigned long tmo = msecs_to_jiffies(dma_timeout);
+	enum dma_transfer_direction dir;
+	long wret;
+	int ret = 0;
+
+	priv = req->priv;
+	sgt = &req->sgt;
+
+	chan = priv->dmach;
+	dir = (req->dir == DMA_FROM_DEVICE) ? DMA_DEV_TO_MEM : DMA_MEM_TO_DEV;
+
+	rmcd_debug(DMA, "%s(%d) uses %s for DMA_%s",
+		   current->comm, task_pid_nr(current),
+		   dev_name(&chan->dev->device),
+		   (dir == DMA_DEV_TO_MEM)?"READ":"WRITE");
+
+	/* Initialize DMA transaction request */
+	tx = prep_dma_xfer(chan, xfer, sgt, nents, dir,
+			   DMA_CTRL_ACK | DMA_PREP_INTERRUPT);
+
+	if (!tx) {
+		rmcd_debug(DMA, "prep error for %s A:0x%llx L:0x%llx",
+			(dir == DMA_DEV_TO_MEM)?"READ":"WRITE",
+			xfer->rio_addr, xfer->length);
+		ret = -EIO;
+		goto err_out;
+	} else if (IS_ERR(tx)) {
+		ret = PTR_ERR(tx);
+		rmcd_debug(DMA, "prep error %d for %s A:0x%llx L:0x%llx", ret,
+			(dir == DMA_DEV_TO_MEM)?"READ":"WRITE",
+			xfer->rio_addr, xfer->length);
+		goto err_out;
+	}
+
+	if (sync == RIO_TRANSFER_FAF)
+		tx->callback = dma_faf_callback;
+	else
+		tx->callback = dma_xfer_callback;
+	tx->callback_param = req;
+
+	req->dmach = chan;
+	req->sync = sync;
+	req->status = DMA_IN_PROGRESS;
+	init_completion(&req->req_comp);
+
+	cookie = dmaengine_submit(tx);
+	req->cookie = cookie;
+
+	rmcd_debug(DMA, "pid=%d DMA_%s tx_cookie = %d", task_pid_nr(current),
+		   (dir == DMA_DEV_TO_MEM)?"READ":"WRITE", cookie);
+
+	if (dma_submit_error(cookie)) {
+		rmcd_error("submit err=%d (addr:0x%llx len:0x%llx)",
+			   cookie, xfer->rio_addr, xfer->length);
+		ret = -EIO;
+		goto err_out;
+	}
+
+	dma_async_issue_pending(chan);
+
+	if (sync == RIO_TRANSFER_ASYNC) {
+		spin_lock(&priv->req_lock);
+		list_add_tail(&req->node, &priv->async_list);
+		spin_unlock(&priv->req_lock);
+		return cookie;
+	} else if (sync == RIO_TRANSFER_FAF)
+		return 0;
+
+	wret = wait_for_completion_interruptible_timeout(&req->req_comp, tmo);
+
+	if (wret == 0) {
+		/* Timeout on wait occurred */
+		rmcd_error("%s(%d) timed out waiting for DMA_%s %d",
+		       current->comm, task_pid_nr(current),
+		       (dir == DMA_DEV_TO_MEM)?"READ":"WRITE", cookie);
+		return -ETIMEDOUT;
+	} else if (wret == -ERESTARTSYS) {
+		/* Wait_for_completion was interrupted by a signal but DMA may
+		 * be in progress
+		 */
+		rmcd_error("%s(%d) wait for DMA_%s %d was interrupted",
+			current->comm, task_pid_nr(current),
+			(dir == DMA_DEV_TO_MEM)?"READ":"WRITE", cookie);
+		return -EINTR;
+	}
+
+	if (req->status != DMA_COMPLETE) {
+		/* DMA transaction completion was signaled with error */
+		rmcd_error("%s(%d) DMA_%s %d completed with status %d (ret=%d)",
+			current->comm, task_pid_nr(current),
+			(dir == DMA_DEV_TO_MEM)?"READ":"WRITE",
+			cookie, req->status, ret);
+		ret = -EIO;
+	}
+
+err_out:
+	return ret;
+}
+
+/*
+ * rio_dma_transfer() - Perform RapidIO DMA data transfer to/from
+ *                      the remote RapidIO device
+ * @filp: file pointer associated with the call
+ * @transfer_mode: DMA transfer mode
+ * @sync: synchronization mode
+ * @dir: DMA transfer direction (DMA_MEM_TO_DEV = write OR
+ *                               DMA_DEV_TO_MEM = read)
+ * @xfer: data transfer descriptor structure
+ */
+static int
+rio_dma_transfer(struct file *filp, uint32_t transfer_mode,
+		 enum rio_transfer_sync sync, enum dma_data_direction dir,
+		 struct rio_transfer_io *xfer)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	unsigned long nr_pages = 0;
+	struct page **page_list = NULL;
+	struct mport_dma_req *req;
+	struct mport_dev *md = priv->md;
+	struct dma_chan *chan;
+	int i, ret;
+	int nents;
+
+	if (xfer->length == 0)
+		return -EINVAL;
+	req = kzalloc(sizeof(*req), GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	ret = get_dma_channel(priv);
+	if (ret) {
+		kfree(req);
+		return ret;
+	}
+
+	/*
+	 * If parameter loc_addr != NULL, we are transferring data from/to
+	 * data buffer allocated in user-space: lock in memory user-space
+	 * buffer pages and build an SG table for DMA transfer request
+	 *
+	 * Otherwise (loc_addr == NULL) contiguous kernel-space buffer is
+	 * used for DMA data transfers: build single entry SG table using
+	 * offset within the internal buffer specified by handle parameter.
+	 */
+	if (xfer->loc_addr) {
+		unsigned long offset;
+		long pinned;
+
+		offset = (unsigned long)xfer->loc_addr & ~PAGE_MASK;
+		nr_pages = PAGE_ALIGN(xfer->length + offset) >> PAGE_SHIFT;
+
+		page_list = kmalloc_array(nr_pages,
+					  sizeof(*page_list), GFP_KERNEL);
+		if (page_list == NULL) {
+			ret = -ENOMEM;
+			goto err_req;
+		}
+
+		down_read(&current->mm->mmap_sem);
+		pinned = get_user_pages(current, current->mm,
+				(unsigned long)xfer->loc_addr & PAGE_MASK,
+				nr_pages, dir == DMA_FROM_DEVICE, 0,
+				page_list, NULL);
+		up_read(&current->mm->mmap_sem);
+
+		if (pinned != nr_pages) {
+			if (pinned < 0) {
+				rmcd_error("get_user_pages err=%ld", pinned);
+				nr_pages = 0;
+			} else
+				rmcd_error("pinned %ld out of %ld pages",
+					   pinned, nr_pages);
+			ret = -EFAULT;
+			goto err_pg;
+		}
+
+		ret = sg_alloc_table_from_pages(&req->sgt, page_list, nr_pages,
+					offset, xfer->length, GFP_KERNEL);
+		if (ret) {
+			rmcd_error("sg_alloc_table failed with err=%d", ret);
+			goto err_pg;
+		}
+
+		req->page_list = page_list;
+		req->nr_pages = nr_pages;
+	} else {
+		dma_addr_t baddr;
+		struct rio_mport_mapping *map;
+
+		baddr = (dma_addr_t)xfer->handle;
+
+		mutex_lock(&md->buf_mutex);
+		list_for_each_entry(map, &md->mappings, node) {
+			if (baddr >= map->phys_addr &&
+			    baddr < (map->phys_addr + map->size)) {
+				kref_get(&map->ref);
+				req->map = map;
+				break;
+			}
+		}
+		mutex_unlock(&md->buf_mutex);
+
+		if (req->map == NULL) {
+			ret = -ENOMEM;
+			goto err_req;
+		}
+
+		if (xfer->length + xfer->offset > map->size) {
+			ret = -EINVAL;
+			goto err_req;
+		}
+
+		ret = sg_alloc_table(&req->sgt, 1, GFP_KERNEL);
+		if (unlikely(ret)) {
+			rmcd_error("sg_alloc_table failed for internal buf");
+			goto err_req;
+		}
+
+		sg_set_buf(req->sgt.sgl,
+			   map->virt_addr + (baddr - map->phys_addr) +
+				xfer->offset, xfer->length);
+	}
+
+	req->dir = dir;
+	req->filp = filp;
+	req->priv = priv;
+	chan = priv->dmach;
+
+	nents = dma_map_sg(chan->device->dev,
+			   req->sgt.sgl, req->sgt.nents, dir);
+	if (nents == -EFAULT) {
+		rmcd_error("Failed to map SG list");
+		return -EFAULT;
+	}
+
+	ret = do_dma_request(req, xfer, sync, nents);
+
+	if (ret >= 0) {
+		if (sync == RIO_TRANSFER_SYNC)
+			goto sync_out;
+		return ret; /* return ASYNC cookie */
+	}
+
+	if (ret == -ETIMEDOUT || ret == -EINTR) {
+		/*
+		 * This can happen only in case of SYNC transfer.
+		 * Do not free unfinished request structure immediately.
+		 * Place it into pending list and deal with it later
+		 */
+		spin_lock(&priv->req_lock);
+		list_add_tail(&req->node, &priv->pend_list);
+		spin_unlock(&priv->req_lock);
+		return ret;
+	}
+
+
+	rmcd_debug(DMA, "do_dma_request failed with err=%d", ret);
+sync_out:
+	dma_unmap_sg(chan->device->dev, req->sgt.sgl, req->sgt.nents, dir);
+	sg_free_table(&req->sgt);
+err_pg:
+	if (page_list) {
+		for (i = 0; i < nr_pages; i++)
+			put_page(page_list[i]);
+		kfree(page_list);
+	}
+err_req:
+	if (req->map) {
+		mutex_lock(&md->buf_mutex);
+		kref_put(&req->map->ref, mport_release_mapping);
+		mutex_unlock(&md->buf_mutex);
+	}
+	put_dma_channel(priv);
+	kfree(req);
+	return ret;
+}
+
+static int rio_mport_transfer_ioctl(struct file *filp, void __user *arg)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	struct rio_transaction transaction;
+	struct rio_transfer_io *transfer;
+	enum dma_data_direction dir;
+	int i, ret = 0;
+
+	if (unlikely(copy_from_user(&transaction, arg, sizeof(transaction))))
+		return -EFAULT;
+
+	if (transaction.count != 1)
+		return -EINVAL;
+
+	if ((transaction.transfer_mode &
+	     priv->md->properties.transfer_mode) == 0)
+		return -ENODEV;
+
+	transfer = vmalloc(transaction.count * sizeof(struct rio_transfer_io));
+	if (!transfer)
+		return -ENOMEM;
+
+	if (unlikely(copy_from_user(transfer, transaction.block,
+	      transaction.count * sizeof(struct rio_transfer_io)))) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	dir = (transaction.dir == RIO_TRANSFER_DIR_READ) ?
+					DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	for (i = 0; i < transaction.count && ret == 0; i++)
+		ret = rio_dma_transfer(filp, transaction.transfer_mode,
+			transaction.sync, dir, &transfer[i]);
+
+	if (unlikely(copy_to_user(transaction.block, transfer,
+	      transaction.count * sizeof(struct rio_transfer_io))))
+		ret = -EFAULT;
+
+out_free:
+	vfree(transfer);
+
+	return ret;
+}
+
+static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg)
+{
+	struct mport_cdev_priv *priv;
+	struct mport_dev *md;
+	struct rio_async_tx_wait w_param;
+	struct mport_dma_req *req;
+	dma_cookie_t cookie;
+	unsigned long tmo;
+	long wret;
+	int found = 0;
+	int ret;
+
+	priv = (struct mport_cdev_priv *)filp->private_data;
+	md = priv->md;
+
+	if (unlikely(copy_from_user(&w_param, arg, sizeof(w_param))))
+		return -EFAULT;
+
+	cookie = w_param.token;
+	if (w_param.timeout)
+		tmo = msecs_to_jiffies(w_param.timeout);
+	else /* Use default DMA timeout */
+		tmo = msecs_to_jiffies(dma_timeout);
+
+	spin_lock(&priv->req_lock);
+	list_for_each_entry(req, &priv->async_list, node) {
+		if (req->cookie == cookie) {
+			list_del(&req->node);
+			found = 1;
+			break;
+		}
+	}
+	spin_unlock(&priv->req_lock);
+
+	if (!found)
+		return -EAGAIN;
+
+	wret = wait_for_completion_interruptible_timeout(&req->req_comp, tmo);
+
+	if (wret == 0) {
+		/* Timeout on wait occurred */
+		rmcd_error("%s(%d) timed out waiting for ASYNC DMA_%s",
+		       current->comm, task_pid_nr(current),
+		       (req->dir == DMA_FROM_DEVICE)?"READ":"WRITE");
+		ret = -ETIMEDOUT;
+		goto err_tmo;
+	} else if (wret == -ERESTARTSYS) {
+		/* Wait_for_completion was interrupted by a signal but DMA may
+		 * be still in progress
+		 */
+		rmcd_error("%s(%d) wait for ASYNC DMA_%s was interrupted",
+			current->comm, task_pid_nr(current),
+			(req->dir == DMA_FROM_DEVICE)?"READ":"WRITE");
+		ret = -EINTR;
+		goto err_tmo;
+	}
+
+	if (req->status != DMA_COMPLETE) {
+		/* DMA transaction completion signaled with transfer error */
+		rmcd_error("%s(%d) ASYNC DMA_%s completion with status %d",
+			current->comm, task_pid_nr(current),
+			(req->dir == DMA_FROM_DEVICE)?"READ":"WRITE",
+			req->status);
+		ret = -EIO;
+	} else
+		ret = 0;
+
+	if (req->status != DMA_IN_PROGRESS && req->status != DMA_PAUSED)
+		dma_req_free(req);
+
+	return ret;
+
+err_tmo:
+	/* Return request back into async queue */
+	spin_lock(&priv->req_lock);
+	list_add_tail(&req->node, &priv->async_list);
+	spin_unlock(&priv->req_lock);
+	return ret;
+}
+
+static int rio_mport_create_dma_mapping(struct mport_dev *md, struct file *filp,
+			uint64_t size, struct rio_mport_mapping **mapping)
+{
+	struct rio_mport_mapping *map;
+
+	map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL);
+	if (map == NULL)
+		return -ENOMEM;
+
+	map->virt_addr = dma_alloc_coherent(md->mport->dev.parent, size,
+					    &map->phys_addr, GFP_KERNEL);
+	if (map->virt_addr == NULL) {
+		kfree(map);
+		return -ENOMEM;
+	}
+
+	map->dir = MAP_DMA;
+	map->size = size;
+	map->filp = filp;
+	map->md = md;
+	kref_init(&map->ref);
+	mutex_lock(&md->buf_mutex);
+	list_add_tail(&map->node, &md->mappings);
+	mutex_unlock(&md->buf_mutex);
+	*mapping = map;
+
+	return 0;
+}
+
+static int rio_mport_alloc_dma(struct file *filp, void __user *arg)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	struct mport_dev *md = priv->md;
+	struct rio_dma_mem map;
+	struct rio_mport_mapping *mapping = NULL;
+	int ret;
+
+	if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_dma_mem))))
+		return -EFAULT;
+
+	ret = rio_mport_create_dma_mapping(md, filp, map.length, &mapping);
+	if (ret)
+		return ret;
+
+	map.dma_handle = mapping->phys_addr;
+
+	if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_dma_mem)))) {
+		mutex_lock(&md->buf_mutex);
+		kref_put(&mapping->ref, mport_release_mapping);
+		mutex_unlock(&md->buf_mutex);
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+static int rio_mport_free_dma(struct file *filp, void __user *arg)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	struct mport_dev *md = priv->md;
+	u64 handle;
+	int ret = -EFAULT;
+	struct rio_mport_mapping *map, *_map;
+
+	if (copy_from_user(&handle, arg, sizeof(u64)))
+		return -EFAULT;
+	rmcd_debug(EXIT, "filp=%p", filp);
+
+	mutex_lock(&md->buf_mutex);
+	list_for_each_entry_safe(map, _map, &md->mappings, node) {
+		if (map->dir == MAP_DMA && map->phys_addr == handle &&
+		    map->filp == filp) {
+			kref_put(&map->ref, mport_release_mapping);
+			ret = 0;
+			break;
+		}
+	}
+	mutex_unlock(&md->buf_mutex);
+
+	if (ret == -EFAULT) {
+		rmcd_debug(DMA, "ERR no matching mapping");
+		return ret;
+	}
+
+	return 0;
+}
+#else
+static int rio_mport_transfer_ioctl(struct file *filp, void *arg)
+{
+	return -ENODEV;
+}
+
+static int rio_mport_wait_for_async_dma(struct file *filp, void __user *arg)
+{
+	return -ENODEV;
+}
+
+static int rio_mport_alloc_dma(struct file *filp, void __user *arg)
+{
+	return -ENODEV;
+}
+
+static int rio_mport_free_dma(struct file *filp, void __user *arg)
+{
+	return -ENODEV;
+}
+#endif /* CONFIG_RAPIDIO_DMA_ENGINE */
+
+/*
+ * Inbound/outbound memory mapping functions
+ */
+
+static int
+rio_mport_create_inbound_mapping(struct mport_dev *md, struct file *filp,
+				u64 raddr, u32 size,
+				struct rio_mport_mapping **mapping)
+{
+	struct rio_mport *mport = md->mport;
+	struct rio_mport_mapping *map;
+	int ret;
+
+	map = kzalloc(sizeof(struct rio_mport_mapping), GFP_KERNEL);
+	if (map == NULL)
+		return -ENOMEM;
+
+	map->virt_addr = dma_alloc_coherent(mport->dev.parent, size,
+					    &map->phys_addr, GFP_KERNEL);
+	if (map->virt_addr == NULL) {
+		ret = -ENOMEM;
+		goto err_dma_alloc;
+	}
+
+	if (raddr == RIO_MAP_ANY_ADDR)
+		raddr = map->phys_addr;
+	ret = rio_map_inb_region(mport, map->phys_addr, raddr, size, 0);
+	if (ret < 0)
+		goto err_map_inb;
+
+	map->dir = MAP_INBOUND;
+	map->rio_addr = raddr;
+	map->size = size;
+	map->filp = filp;
+	map->md = md;
+	kref_init(&map->ref);
+	mutex_lock(&md->buf_mutex);
+	list_add_tail(&map->node, &md->mappings);
+	mutex_unlock(&md->buf_mutex);
+	*mapping = map;
+	return 0;
+
+err_map_inb:
+	dma_free_coherent(mport->dev.parent, size,
+			  map->virt_addr, map->phys_addr);
+err_dma_alloc:
+	kfree(map);
+	return ret;
+}
+
+static int
+rio_mport_get_inbound_mapping(struct mport_dev *md, struct file *filp,
+			      u64 raddr, u32 size,
+			      struct rio_mport_mapping **mapping)
+{
+	struct rio_mport_mapping *map;
+	int err = -ENOMEM;
+
+	if (raddr == RIO_MAP_ANY_ADDR)
+		goto get_new;
+
+	mutex_lock(&md->buf_mutex);
+	list_for_each_entry(map, &md->mappings, node) {
+		if (map->dir != MAP_INBOUND)
+			continue;
+		if (raddr == map->rio_addr && size == map->size) {
+			/* allow exact match only */
+			*mapping = map;
+			err = 0;
+			break;
+		} else if (raddr < (map->rio_addr + map->size - 1) &&
+			   (raddr + size) > map->rio_addr) {
+			err = -EBUSY;
+			break;
+		}
+	}
+	mutex_unlock(&md->buf_mutex);
+
+	if (err != -ENOMEM)
+		return err;
+get_new:
+	/* not found, create new */
+	return rio_mport_create_inbound_mapping(md, filp, raddr, size, mapping);
+}
+
+static int rio_mport_map_inbound(struct file *filp, void __user *arg)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	struct mport_dev *md = priv->md;
+	struct rio_mmap map;
+	struct rio_mport_mapping *mapping = NULL;
+	int ret;
+
+	if (!md->mport->ops->map_inb)
+		return -EPROTONOSUPPORT;
+	if (unlikely(copy_from_user(&map, arg, sizeof(struct rio_mmap))))
+		return -EFAULT;
+
+	rmcd_debug(IBW, "%s filp=%p", dev_name(&priv->md->dev), filp);
+
+	ret = rio_mport_get_inbound_mapping(md, filp, map.rio_addr,
+					    map.length, &mapping);
+	if (ret)
+		return ret;
+
+	map.handle = mapping->phys_addr;
+	map.rio_addr = mapping->rio_addr;
+
+	if (unlikely(copy_to_user(arg, &map, sizeof(struct rio_mmap)))) {
+		/* Delete mapping if it was created by this request */
+		if (ret == 0 && mapping->filp == filp) {
+			mutex_lock(&md->buf_mutex);
+			kref_put(&mapping->ref, mport_release_mapping);
+			mutex_unlock(&md->buf_mutex);
+		}
+		return -EFAULT;
+	}
+
+	return 0;
+}
+
+/*
+ * rio_mport_inbound_free() - unmap from RapidIO address space and free
+ *                    previously allocated inbound DMA coherent buffer
+ * @priv: driver private data
+ * @arg:  buffer handle returned by allocation routine
+ */
+static int rio_mport_inbound_free(struct file *filp, void __user *arg)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	struct mport_dev *md = priv->md;
+	u64 handle;
+	struct rio_mport_mapping *map, *_map;
+
+	rmcd_debug(IBW, "%s filp=%p", dev_name(&priv->md->dev), filp);
+
+	if (!md->mport->ops->unmap_inb)
+		return -EPROTONOSUPPORT;
+
+	if (copy_from_user(&handle, arg, sizeof(u64)))
+		return -EFAULT;
+
+	mutex_lock(&md->buf_mutex);
+	list_for_each_entry_safe(map, _map, &md->mappings, node) {
+		if (map->dir == MAP_INBOUND && map->phys_addr == handle) {
+			if (map->filp == filp) {
+				map->filp = NULL;
+				kref_put(&map->ref, mport_release_mapping);
+			}
+			break;
+		}
+	}
+	mutex_unlock(&md->buf_mutex);
+
+	return 0;
+}
+
+/*
+ * maint_port_idx_get() - Get the port index of the mport instance
+ * @priv: driver private data
+ * @arg:  port index
+ */
+static int maint_port_idx_get(struct mport_cdev_priv *priv, void __user *arg)
+{
+	struct mport_dev *md = priv->md;
+	uint32_t port_idx = md->mport->index;
+
+	rmcd_debug(MPORT, "port_index=%d", port_idx);
+
+	if (copy_to_user(arg, &port_idx, sizeof(port_idx)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int rio_mport_add_event(struct mport_cdev_priv *priv,
+			       struct rio_event *event)
+{
+	int overflow;
+
+	if (!(priv->event_mask & event->header))
+		return -EACCES;
+
+	spin_lock(&priv->fifo_lock);
+	overflow = kfifo_avail(&priv->event_fifo) < sizeof(*event)
+		|| kfifo_in(&priv->event_fifo, (unsigned char *)event,
+			sizeof(*event)) != sizeof(*event);
+	spin_unlock(&priv->fifo_lock);
+
+	wake_up_interruptible(&priv->event_rx_wait);
+
+	if (overflow) {
+		dev_warn(&priv->md->dev, DRV_NAME ": event fifo overflow\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void rio_mport_doorbell_handler(struct rio_mport *mport, void *dev_id,
+				       u16 src, u16 dst, u16 info)
+{
+	struct mport_dev *data = dev_id;
+	struct mport_cdev_priv *priv;
+	struct rio_mport_db_filter *db_filter;
+	struct rio_event event;
+	int handled;
+
+	event.header = RIO_DOORBELL;
+	event.u.doorbell.rioid = src;
+	event.u.doorbell.payload = info;
+
+	handled = 0;
+	spin_lock(&data->db_lock);
+	list_for_each_entry(db_filter, &data->doorbells, data_node) {
+		if (((db_filter->filter.rioid == 0xffffffff ||
+		      db_filter->filter.rioid == src)) &&
+		      info >= db_filter->filter.low &&
+		      info <= db_filter->filter.high) {
+			priv = db_filter->priv;
+			rio_mport_add_event(priv, &event);
+			handled = 1;
+		}
+	}
+	spin_unlock(&data->db_lock);
+
+	if (!handled)
+		dev_warn(&data->dev,
+			"%s: spurious DB received from 0x%x, info=0x%04x\n",
+			__func__, src, info);
+}
+
+static int rio_mport_add_db_filter(struct mport_cdev_priv *priv,
+				   void __user *arg)
+{
+	struct mport_dev *md = priv->md;
+	struct rio_mport_db_filter *db_filter;
+	struct rio_doorbell_filter filter;
+	unsigned long flags;
+	int ret;
+
+	if (copy_from_user(&filter, arg, sizeof(filter)))
+		return -EFAULT;
+
+	if (filter.low > filter.high)
+		return -EINVAL;
+
+	ret = rio_request_inb_dbell(md->mport, md, filter.low, filter.high,
+				    rio_mport_doorbell_handler);
+	if (ret) {
+		rmcd_error("%s failed to register IBDB, err=%d",
+			   dev_name(&md->dev), ret);
+		return ret;
+	}
+
+	db_filter = kzalloc(sizeof(*db_filter), GFP_KERNEL);
+	if (db_filter == NULL) {
+		rio_release_inb_dbell(md->mport, filter.low, filter.high);
+		return -ENOMEM;
+	}
+
+	db_filter->filter = filter;
+	db_filter->priv = priv;
+	spin_lock_irqsave(&md->db_lock, flags);
+	list_add_tail(&db_filter->priv_node, &priv->db_filters);
+	list_add_tail(&db_filter->data_node, &md->doorbells);
+	spin_unlock_irqrestore(&md->db_lock, flags);
+
+	return 0;
+}
+
+static void rio_mport_delete_db_filter(struct rio_mport_db_filter *db_filter)
+{
+	list_del(&db_filter->data_node);
+	list_del(&db_filter->priv_node);
+	kfree(db_filter);
+}
+
+static int rio_mport_remove_db_filter(struct mport_cdev_priv *priv,
+				      void __user *arg)
+{
+	struct rio_mport_db_filter *db_filter;
+	struct rio_doorbell_filter filter;
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	if (copy_from_user(&filter, arg, sizeof(filter)))
+		return -EFAULT;
+
+	spin_lock_irqsave(&priv->md->db_lock, flags);
+	list_for_each_entry(db_filter, &priv->db_filters, priv_node) {
+		if (db_filter->filter.rioid == filter.rioid &&
+		    db_filter->filter.low == filter.low &&
+		    db_filter->filter.high == filter.high) {
+			rio_mport_delete_db_filter(db_filter);
+			ret = 0;
+			break;
+		}
+	}
+	spin_unlock_irqrestore(&priv->md->db_lock, flags);
+
+	if (!ret)
+		rio_release_inb_dbell(priv->md->mport, filter.low, filter.high);
+
+	return ret;
+}
+
+static int rio_mport_match_pw(union rio_pw_msg *msg,
+			      struct rio_pw_filter *filter)
+{
+	if ((msg->em.comptag & filter->mask) < filter->low ||
+		(msg->em.comptag & filter->mask) > filter->high)
+		return 0;
+	return 1;
+}
+
+static int rio_mport_pw_handler(struct rio_mport *mport, void *context,
+				union rio_pw_msg *msg, int step)
+{
+	struct mport_dev *md = context;
+	struct mport_cdev_priv *priv;
+	struct rio_mport_pw_filter *pw_filter;
+	struct rio_event event;
+	int handled;
+
+	event.header = RIO_PORTWRITE;
+	memcpy(event.u.portwrite.payload, msg->raw, RIO_PW_MSG_SIZE);
+
+	handled = 0;
+	spin_lock(&md->pw_lock);
+	list_for_each_entry(pw_filter, &md->portwrites, md_node) {
+		if (rio_mport_match_pw(msg, &pw_filter->filter)) {
+			priv = pw_filter->priv;
+			rio_mport_add_event(priv, &event);
+			handled = 1;
+		}
+	}
+	spin_unlock(&md->pw_lock);
+
+	if (!handled) {
+		printk_ratelimited(KERN_WARNING DRV_NAME
+			": mport%d received spurious PW from 0x%08x\n",
+			mport->id, msg->em.comptag);
+	}
+
+	return 0;
+}
+
+static int rio_mport_add_pw_filter(struct mport_cdev_priv *priv,
+				   void __user *arg)
+{
+	struct mport_dev *md = priv->md;
+	struct rio_mport_pw_filter *pw_filter;
+	struct rio_pw_filter filter;
+	unsigned long flags;
+	int hadd = 0;
+
+	if (copy_from_user(&filter, arg, sizeof(filter)))
+		return -EFAULT;
+
+	pw_filter = kzalloc(sizeof(*pw_filter), GFP_KERNEL);
+	if (pw_filter == NULL)
+		return -ENOMEM;
+
+	pw_filter->filter = filter;
+	pw_filter->priv = priv;
+	spin_lock_irqsave(&md->pw_lock, flags);
+	if (list_empty(&md->portwrites))
+		hadd = 1;
+	list_add_tail(&pw_filter->priv_node, &priv->pw_filters);
+	list_add_tail(&pw_filter->md_node, &md->portwrites);
+	spin_unlock_irqrestore(&md->pw_lock, flags);
+
+	if (hadd) {
+		int ret;
+
+		ret = rio_add_mport_pw_handler(md->mport, md,
+					       rio_mport_pw_handler);
+		if (ret) {
+			dev_err(&md->dev,
+				"%s: failed to add IB_PW handler, err=%d\n",
+				__func__, ret);
+			return ret;
+		}
+		rio_pw_enable(md->mport, 1);
+	}
+
+	return 0;
+}
+
+static void rio_mport_delete_pw_filter(struct rio_mport_pw_filter *pw_filter)
+{
+	list_del(&pw_filter->md_node);
+	list_del(&pw_filter->priv_node);
+	kfree(pw_filter);
+}
+
+static int rio_mport_match_pw_filter(struct rio_pw_filter *a,
+				     struct rio_pw_filter *b)
+{
+	if ((a->mask == b->mask) && (a->low == b->low) && (a->high == b->high))
+		return 1;
+	return 0;
+}
+
+static int rio_mport_remove_pw_filter(struct mport_cdev_priv *priv,
+				      void __user *arg)
+{
+	struct mport_dev *md = priv->md;
+	struct rio_mport_pw_filter *pw_filter;
+	struct rio_pw_filter filter;
+	unsigned long flags;
+	int ret = -EINVAL;
+	int hdel = 0;
+
+	if (copy_from_user(&filter, arg, sizeof(filter)))
+		return -EFAULT;
+
+	spin_lock_irqsave(&md->pw_lock, flags);
+	list_for_each_entry(pw_filter, &priv->pw_filters, priv_node) {
+		if (rio_mport_match_pw_filter(&pw_filter->filter, &filter)) {
+			rio_mport_delete_pw_filter(pw_filter);
+			ret = 0;
+			break;
+		}
+	}
+
+	if (list_empty(&md->portwrites))
+		hdel = 1;
+	spin_unlock_irqrestore(&md->pw_lock, flags);
+
+	if (hdel) {
+		rio_del_mport_pw_handler(md->mport, priv->md,
+					 rio_mport_pw_handler);
+		rio_pw_enable(md->mport, 0);
+	}
+
+	return ret;
+}
+
+/*
+ * rio_release_dev - release routine for kernel RIO device object
+ * @dev: kernel device object associated with a RIO device structure
+ *
+ * Frees a RIO device struct associated a RIO device struct.
+ * The RIO device struct is freed.
+ */
+static void rio_release_dev(struct device *dev)
+{
+	struct rio_dev *rdev;
+
+	rdev = to_rio_dev(dev);
+	pr_info(DRV_PREFIX "%s: %s\n", __func__, rio_name(rdev));
+	kfree(rdev);
+}
+
+
+static void rio_release_net(struct device *dev)
+{
+	struct rio_net *net;
+
+	net = to_rio_net(dev);
+	rmcd_debug(RDEV, "net_%d", net->id);
+	kfree(net);
+}
+
+
+/*
+ * rio_mport_add_riodev - creates a kernel RIO device object
+ *
+ * Allocates a RIO device data structure and initializes required fields based
+ * on device's configuration space contents.
+ * If the device has switch capabilities, then a switch specific portion is
+ * allocated and configured.
+ */
+static int rio_mport_add_riodev(struct mport_cdev_priv *priv,
+				   void __user *arg)
+{
+	struct mport_dev *md = priv->md;
+	struct rio_rdev_info dev_info;
+	struct rio_dev *rdev;
+	struct rio_switch *rswitch = NULL;
+	struct rio_mport *mport;
+	size_t size;
+	u32 rval;
+	u32 swpinfo = 0;
+	u16 destid;
+	u8 hopcount;
+	int err;
+
+	if (copy_from_user(&dev_info, arg, sizeof(dev_info)))
+		return -EFAULT;
+
+	rmcd_debug(RDEV, "name:%s ct:0x%x did:0x%x hc:0x%x", dev_info.name,
+		   dev_info.comptag, dev_info.destid, dev_info.hopcount);
+
+	if (bus_find_device_by_name(&rio_bus_type, NULL, dev_info.name)) {
+		rmcd_debug(RDEV, "device %s already exists", dev_info.name);
+		return -EEXIST;
+	}
+
+	size = sizeof(struct rio_dev);
+	mport = md->mport;
+	destid = (u16)dev_info.destid;
+	hopcount = (u8)dev_info.hopcount;
+
+	if (rio_mport_read_config_32(mport, destid, hopcount,
+				     RIO_PEF_CAR, &rval))
+		return -EIO;
+
+	if (rval & RIO_PEF_SWITCH) {
+		rio_mport_read_config_32(mport, destid, hopcount,
+					 RIO_SWP_INFO_CAR, &swpinfo);
+		size += (RIO_GET_TOTAL_PORTS(swpinfo) *
+			 sizeof(rswitch->nextdev[0])) + sizeof(*rswitch);
+	}
+
+	rdev = kzalloc(size, GFP_KERNEL);
+	if (rdev == NULL)
+		return -ENOMEM;
+
+	if (mport->net == NULL) {
+		struct rio_net *net;
+
+		net = rio_alloc_net(mport);
+		if (!net) {
+			err = -ENOMEM;
+			rmcd_debug(RDEV, "failed to allocate net object");
+			goto cleanup;
+		}
+
+		net->id = mport->id;
+		net->hport = mport;
+		dev_set_name(&net->dev, "rnet_%d", net->id);
+		net->dev.parent = &mport->dev;
+		net->dev.release = rio_release_net;
+		err = rio_add_net(net);
+		if (err) {
+			rmcd_debug(RDEV, "failed to register net, err=%d", err);
+			kfree(net);
+			goto cleanup;
+		}
+	}
+
+	rdev->net = mport->net;
+	rdev->pef = rval;
+	rdev->swpinfo = swpinfo;
+	rio_mport_read_config_32(mport, destid, hopcount,
+				 RIO_DEV_ID_CAR, &rval);
+	rdev->did = rval >> 16;
+	rdev->vid = rval & 0xffff;
+	rio_mport_read_config_32(mport, destid, hopcount, RIO_DEV_INFO_CAR,
+				 &rdev->device_rev);
+	rio_mport_read_config_32(mport, destid, hopcount, RIO_ASM_ID_CAR,
+				 &rval);
+	rdev->asm_did = rval >> 16;
+	rdev->asm_vid = rval & 0xffff;
+	rio_mport_read_config_32(mport, destid, hopcount, RIO_ASM_INFO_CAR,
+				 &rval);
+	rdev->asm_rev = rval >> 16;
+
+	if (rdev->pef & RIO_PEF_EXT_FEATURES) {
+		rdev->efptr = rval & 0xffff;
+		rdev->phys_efptr = rio_mport_get_physefb(mport, 0, destid,
+							 hopcount);
+
+		rdev->em_efptr = rio_mport_get_feature(mport, 0, destid,
+						hopcount, RIO_EFB_ERR_MGMNT);
+	}
+
+	rio_mport_read_config_32(mport, destid, hopcount, RIO_SRC_OPS_CAR,
+				 &rdev->src_ops);
+	rio_mport_read_config_32(mport, destid, hopcount, RIO_DST_OPS_CAR,
+				 &rdev->dst_ops);
+
+	rdev->comp_tag = dev_info.comptag;
+	rdev->destid = destid;
+	/* hopcount is stored as specified by a caller, regardles of EP or SW */
+	rdev->hopcount = hopcount;
+
+	if (rdev->pef & RIO_PEF_SWITCH) {
+		rswitch = rdev->rswitch;
+		rswitch->route_table = NULL;
+	}
+
+	if (strlen(dev_info.name))
+		dev_set_name(&rdev->dev, "%s", dev_info.name);
+	else if (rdev->pef & RIO_PEF_SWITCH)
+		dev_set_name(&rdev->dev, "%02x:s:%04x", mport->id,
+			     rdev->comp_tag & RIO_CTAG_UDEVID);
+	else
+		dev_set_name(&rdev->dev, "%02x:e:%04x", mport->id,
+			     rdev->comp_tag & RIO_CTAG_UDEVID);
+
+	INIT_LIST_HEAD(&rdev->net_list);
+	rdev->dev.parent = &mport->net->dev;
+	rio_attach_device(rdev);
+	rdev->dev.release = rio_release_dev;
+
+	if (rdev->dst_ops & RIO_DST_OPS_DOORBELL)
+		rio_init_dbell_res(&rdev->riores[RIO_DOORBELL_RESOURCE],
+				   0, 0xffff);
+	err = rio_add_device(rdev);
+	if (err)
+		goto cleanup;
+	rio_dev_get(rdev);
+
+	return 0;
+cleanup:
+	kfree(rdev);
+	return err;
+}
+
+static int rio_mport_del_riodev(struct mport_cdev_priv *priv, void __user *arg)
+{
+	struct rio_rdev_info dev_info;
+	struct rio_dev *rdev = NULL;
+	struct device  *dev;
+	struct rio_mport *mport;
+	struct rio_net *net;
+
+	if (copy_from_user(&dev_info, arg, sizeof(dev_info)))
+		return -EFAULT;
+
+	mport = priv->md->mport;
+
+	/* If device name is specified, removal by name has priority */
+	if (strlen(dev_info.name)) {
+		dev = bus_find_device_by_name(&rio_bus_type, NULL,
+					      dev_info.name);
+		if (dev)
+			rdev = to_rio_dev(dev);
+	} else {
+		do {
+			rdev = rio_get_comptag(dev_info.comptag, rdev);
+			if (rdev && rdev->dev.parent == &mport->net->dev &&
+			    rdev->destid == (u16)dev_info.destid &&
+			    rdev->hopcount == (u8)dev_info.hopcount)
+				break;
+		} while (rdev);
+	}
+
+	if (!rdev) {
+		rmcd_debug(RDEV,
+			"device name:%s ct:0x%x did:0x%x hc:0x%x not found",
+			dev_info.name, dev_info.comptag, dev_info.destid,
+			dev_info.hopcount);
+		return -ENODEV;
+	}
+
+	net = rdev->net;
+	rio_dev_put(rdev);
+	rio_del_device(rdev, RIO_DEVICE_SHUTDOWN);
+
+	if (list_empty(&net->devices)) {
+		rio_free_net(net);
+		mport->net = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * Mport cdev management
+ */
+
+/*
+ * mport_cdev_open() - Open character device (mport)
+ */
+static int mport_cdev_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+	int minor = iminor(inode);
+	struct mport_dev *chdev;
+	struct mport_cdev_priv *priv;
+
+	/* Test for valid device */
+	if (minor >= RIO_MAX_MPORTS) {
+		rmcd_error("Invalid minor device number");
+		return -EINVAL;
+	}
+
+	chdev = container_of(inode->i_cdev, struct mport_dev, cdev);
+
+	rmcd_debug(INIT, "%s filp=%p", dev_name(&chdev->dev), filp);
+
+	if (atomic_read(&chdev->active) == 0)
+		return -ENODEV;
+
+	get_device(&chdev->dev);
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		put_device(&chdev->dev);
+		return -ENOMEM;
+	}
+
+	priv->md = chdev;
+
+	mutex_lock(&chdev->file_mutex);
+	list_add_tail(&priv->list, &chdev->file_list);
+	mutex_unlock(&chdev->file_mutex);
+
+	INIT_LIST_HEAD(&priv->db_filters);
+	INIT_LIST_HEAD(&priv->pw_filters);
+	spin_lock_init(&priv->fifo_lock);
+	init_waitqueue_head(&priv->event_rx_wait);
+	ret = kfifo_alloc(&priv->event_fifo,
+			  sizeof(struct rio_event) * MPORT_EVENT_DEPTH,
+			  GFP_KERNEL);
+	if (ret < 0) {
+		dev_err(&chdev->dev, DRV_NAME ": kfifo_alloc failed\n");
+		ret = -ENOMEM;
+		goto err_fifo;
+	}
+
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+	INIT_LIST_HEAD(&priv->async_list);
+	INIT_LIST_HEAD(&priv->pend_list);
+	spin_lock_init(&priv->req_lock);
+	mutex_init(&priv->dma_lock);
+#endif
+
+	filp->private_data = priv;
+	goto out;
+err_fifo:
+	kfree(priv);
+out:
+	return ret;
+}
+
+static int mport_cdev_fasync(int fd, struct file *filp, int mode)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+
+	return fasync_helper(fd, filp, mode, &priv->async_queue);
+}
+
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+static void mport_cdev_release_dma(struct file *filp)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	struct mport_dev *md;
+	struct mport_dma_req *req, *req_next;
+	unsigned long tmo = msecs_to_jiffies(dma_timeout);
+	long wret;
+	LIST_HEAD(list);
+
+	rmcd_debug(EXIT, "from filp=%p %s(%d)",
+		   filp, current->comm, task_pid_nr(current));
+
+	if (!priv->dmach) {
+		rmcd_debug(EXIT, "No DMA channel for filp=%p", filp);
+		return;
+	}
+
+	md = priv->md;
+
+	flush_workqueue(dma_wq);
+
+	spin_lock(&priv->req_lock);
+	if (!list_empty(&priv->async_list)) {
+		rmcd_debug(EXIT, "async list not empty filp=%p %s(%d)",
+			   filp, current->comm, task_pid_nr(current));
+		list_splice_init(&priv->async_list, &list);
+	}
+	spin_unlock(&priv->req_lock);
+
+	if (!list_empty(&list)) {
+		rmcd_debug(EXIT, "temp list not empty");
+		list_for_each_entry_safe(req, req_next, &list, node) {
+			rmcd_debug(EXIT, "free req->filp=%p cookie=%d compl=%s",
+				   req->filp, req->cookie,
+				   completion_done(&req->req_comp)?"yes":"no");
+			list_del(&req->node);
+			dma_req_free(req);
+		}
+	}
+
+	if (!list_empty(&priv->pend_list)) {
+		rmcd_debug(EXIT, "Free pending DMA requests for filp=%p %s(%d)",
+			   filp, current->comm, task_pid_nr(current));
+		list_for_each_entry_safe(req,
+					 req_next, &priv->pend_list, node) {
+			rmcd_debug(EXIT, "free req->filp=%p cookie=%d compl=%s",
+				   req->filp, req->cookie,
+				   completion_done(&req->req_comp)?"yes":"no");
+			list_del(&req->node);
+			dma_req_free(req);
+		}
+	}
+
+	put_dma_channel(priv);
+	wret = wait_for_completion_interruptible_timeout(&priv->comp, tmo);
+
+	if (wret <= 0) {
+		rmcd_error("%s(%d) failed waiting for DMA release err=%ld",
+			current->comm, task_pid_nr(current), wret);
+	}
+
+	spin_lock(&priv->req_lock);
+
+	if (!list_empty(&priv->pend_list)) {
+		rmcd_debug(EXIT, "ATTN: pending DMA requests, filp=%p %s(%d)",
+			   filp, current->comm, task_pid_nr(current));
+	}
+
+	spin_unlock(&priv->req_lock);
+
+	if (priv->dmach != priv->md->dma_chan) {
+		rmcd_debug(EXIT, "Release DMA channel for filp=%p %s(%d)",
+			   filp, current->comm, task_pid_nr(current));
+		rio_release_dma(priv->dmach);
+	} else {
+		rmcd_debug(EXIT, "Adjust default DMA channel refcount");
+		kref_put(&md->dma_ref, mport_release_def_dma);
+	}
+
+	priv->dmach = NULL;
+}
+#else
+#define mport_cdev_release_dma(priv) do {} while (0)
+#endif
+
+/*
+ * mport_cdev_release() - Release character device
+ */
+static int mport_cdev_release(struct inode *inode, struct file *filp)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	struct mport_dev *chdev;
+	struct rio_mport_pw_filter *pw_filter, *pw_filter_next;
+	struct rio_mport_db_filter *db_filter, *db_filter_next;
+	struct rio_mport_mapping *map, *_map;
+	unsigned long flags;
+
+	rmcd_debug(EXIT, "%s filp=%p", dev_name(&priv->md->dev), filp);
+
+	chdev = priv->md;
+	mport_cdev_release_dma(filp);
+
+	priv->event_mask = 0;
+
+	spin_lock_irqsave(&chdev->pw_lock, flags);
+	if (!list_empty(&priv->pw_filters)) {
+		list_for_each_entry_safe(pw_filter, pw_filter_next,
+					 &priv->pw_filters, priv_node)
+			rio_mport_delete_pw_filter(pw_filter);
+	}
+	spin_unlock_irqrestore(&chdev->pw_lock, flags);
+
+	spin_lock_irqsave(&chdev->db_lock, flags);
+	list_for_each_entry_safe(db_filter, db_filter_next,
+				 &priv->db_filters, priv_node) {
+		rio_mport_delete_db_filter(db_filter);
+	}
+	spin_unlock_irqrestore(&chdev->db_lock, flags);
+
+	kfifo_free(&priv->event_fifo);
+
+	mutex_lock(&chdev->buf_mutex);
+	list_for_each_entry_safe(map, _map, &chdev->mappings, node) {
+		if (map->filp == filp) {
+			rmcd_debug(EXIT, "release mapping %p filp=%p",
+				   map->virt_addr, filp);
+			kref_put(&map->ref, mport_release_mapping);
+		}
+	}
+	mutex_unlock(&chdev->buf_mutex);
+
+	mport_cdev_fasync(-1, filp, 0);
+	filp->private_data = NULL;
+	mutex_lock(&chdev->file_mutex);
+	list_del(&priv->list);
+	mutex_unlock(&chdev->file_mutex);
+	put_device(&chdev->dev);
+	kfree(priv);
+	return 0;
+}
+
+/*
+ * mport_cdev_ioctl() - IOCTLs for character device
+ */
+static long mport_cdev_ioctl(struct file *filp,
+		unsigned int cmd, unsigned long arg)
+{
+	int err = -EINVAL;
+	struct mport_cdev_priv *data = filp->private_data;
+	struct mport_dev *md = data->md;
+
+	if (atomic_read(&md->active) == 0)
+		return -ENODEV;
+
+	switch (cmd) {
+	case RIO_MPORT_MAINT_READ_LOCAL:
+		return rio_mport_maint_rd(data, (void __user *)arg, 1);
+	case RIO_MPORT_MAINT_WRITE_LOCAL:
+		return rio_mport_maint_wr(data, (void __user *)arg, 1);
+	case RIO_MPORT_MAINT_READ_REMOTE:
+		return rio_mport_maint_rd(data, (void __user *)arg, 0);
+	case RIO_MPORT_MAINT_WRITE_REMOTE:
+		return rio_mport_maint_wr(data, (void __user *)arg, 0);
+	case RIO_MPORT_MAINT_HDID_SET:
+		return maint_hdid_set(data, (void __user *)arg);
+	case RIO_MPORT_MAINT_COMPTAG_SET:
+		return maint_comptag_set(data, (void __user *)arg);
+	case RIO_MPORT_MAINT_PORT_IDX_GET:
+		return maint_port_idx_get(data, (void __user *)arg);
+	case RIO_MPORT_GET_PROPERTIES:
+		md->properties.hdid = md->mport->host_deviceid;
+		if (copy_to_user((void __user *)arg, &(data->md->properties),
+				 sizeof(data->md->properties)))
+			return -EFAULT;
+		return 0;
+	case RIO_ENABLE_DOORBELL_RANGE:
+		return rio_mport_add_db_filter(data, (void __user *)arg);
+	case RIO_DISABLE_DOORBELL_RANGE:
+		return rio_mport_remove_db_filter(data, (void __user *)arg);
+	case RIO_ENABLE_PORTWRITE_RANGE:
+		return rio_mport_add_pw_filter(data, (void __user *)arg);
+	case RIO_DISABLE_PORTWRITE_RANGE:
+		return rio_mport_remove_pw_filter(data, (void __user *)arg);
+	case RIO_SET_EVENT_MASK:
+		data->event_mask = arg;
+		return 0;
+	case RIO_GET_EVENT_MASK:
+		if (copy_to_user((void __user *)arg, &data->event_mask,
+				    sizeof(data->event_mask)))
+			return -EFAULT;
+		return 0;
+	case RIO_MAP_OUTBOUND:
+		return rio_mport_obw_map(filp, (void __user *)arg);
+	case RIO_MAP_INBOUND:
+		return rio_mport_map_inbound(filp, (void __user *)arg);
+	case RIO_UNMAP_OUTBOUND:
+		return rio_mport_obw_free(filp, (void __user *)arg);
+	case RIO_UNMAP_INBOUND:
+		return rio_mport_inbound_free(filp, (void __user *)arg);
+	case RIO_ALLOC_DMA:
+		return rio_mport_alloc_dma(filp, (void __user *)arg);
+	case RIO_FREE_DMA:
+		return rio_mport_free_dma(filp, (void __user *)arg);
+	case RIO_WAIT_FOR_ASYNC:
+		return rio_mport_wait_for_async_dma(filp, (void __user *)arg);
+	case RIO_TRANSFER:
+		return rio_mport_transfer_ioctl(filp, (void __user *)arg);
+	case RIO_DEV_ADD:
+		return rio_mport_add_riodev(data, (void __user *)arg);
+	case RIO_DEV_DEL:
+		return rio_mport_del_riodev(data, (void __user *)arg);
+	default:
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * mport_release_mapping - free mapping resources and info structure
+ * @ref: a pointer to the kref within struct rio_mport_mapping
+ *
+ * NOTE: Shall be called while holding buf_mutex.
+ */
+static void mport_release_mapping(struct kref *ref)
+{
+	struct rio_mport_mapping *map =
+			container_of(ref, struct rio_mport_mapping, ref);
+	struct rio_mport *mport = map->md->mport;
+
+	rmcd_debug(MMAP, "type %d mapping @ %p (phys = %pad) for %s",
+		   map->dir, map->virt_addr,
+		   &map->phys_addr, mport->name);
+
+	list_del(&map->node);
+
+	switch (map->dir) {
+	case MAP_INBOUND:
+		rio_unmap_inb_region(mport, map->phys_addr);
+	case MAP_DMA:
+		dma_free_coherent(mport->dev.parent, map->size,
+				  map->virt_addr, map->phys_addr);
+		break;
+	case MAP_OUTBOUND:
+		rio_unmap_outb_region(mport, map->rioid, map->rio_addr);
+		break;
+	}
+	kfree(map);
+}
+
+static void mport_mm_open(struct vm_area_struct *vma)
+{
+	struct rio_mport_mapping *map = vma->vm_private_data;
+
+rmcd_debug(MMAP, "0x%pad", &map->phys_addr);
+	kref_get(&map->ref);
+}
+
+static void mport_mm_close(struct vm_area_struct *vma)
+{
+	struct rio_mport_mapping *map = vma->vm_private_data;
+
+rmcd_debug(MMAP, "0x%pad", &map->phys_addr);
+	mutex_lock(&map->md->buf_mutex);
+	kref_put(&map->ref, mport_release_mapping);
+	mutex_unlock(&map->md->buf_mutex);
+}
+
+static const struct vm_operations_struct vm_ops = {
+	.open =	mport_mm_open,
+	.close = mport_mm_close,
+};
+
+static int mport_cdev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	struct mport_dev *md;
+	size_t size = vma->vm_end - vma->vm_start;
+	dma_addr_t baddr;
+	unsigned long offset;
+	int found = 0, ret;
+	struct rio_mport_mapping *map;
+
+	rmcd_debug(MMAP, "0x%x bytes at offset 0x%lx",
+		   (unsigned int)size, vma->vm_pgoff);
+
+	md = priv->md;
+	baddr = ((dma_addr_t)vma->vm_pgoff << PAGE_SHIFT);
+
+	mutex_lock(&md->buf_mutex);
+	list_for_each_entry(map, &md->mappings, node) {
+		if (baddr >= map->phys_addr &&
+		    baddr < (map->phys_addr + map->size)) {
+			found = 1;
+			break;
+		}
+	}
+	mutex_unlock(&md->buf_mutex);
+
+	if (!found)
+		return -ENOMEM;
+
+	offset = baddr - map->phys_addr;
+
+	if (size + offset > map->size)
+		return -EINVAL;
+
+	vma->vm_pgoff = offset >> PAGE_SHIFT;
+	rmcd_debug(MMAP, "MMAP adjusted offset = 0x%lx", vma->vm_pgoff);
+
+	if (map->dir == MAP_INBOUND || map->dir == MAP_DMA)
+		ret = dma_mmap_coherent(md->mport->dev.parent, vma,
+				map->virt_addr, map->phys_addr, map->size);
+	else if (map->dir == MAP_OUTBOUND) {
+		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+		ret = vm_iomap_memory(vma, map->phys_addr, map->size);
+	} else {
+		rmcd_error("Attempt to mmap unsupported mapping type");
+		ret = -EIO;
+	}
+
+	if (!ret) {
+		vma->vm_private_data = map;
+		vma->vm_ops = &vm_ops;
+		mport_mm_open(vma);
+	} else {
+		rmcd_error("MMAP exit with err=%d", ret);
+	}
+
+	return ret;
+}
+
+static unsigned int mport_cdev_poll(struct file *filp, poll_table *wait)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+
+	poll_wait(filp, &priv->event_rx_wait, wait);
+	if (kfifo_len(&priv->event_fifo))
+		return POLLIN | POLLRDNORM;
+
+	return 0;
+}
+
+static ssize_t mport_read(struct file *filp, char __user *buf, size_t count,
+			loff_t *ppos)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	int copied;
+	ssize_t ret;
+
+	if (!count)
+		return 0;
+
+	if (kfifo_is_empty(&priv->event_fifo) &&
+	    (filp->f_flags & O_NONBLOCK))
+		return -EAGAIN;
+
+	if (count % sizeof(struct rio_event))
+		return -EINVAL;
+
+	ret = wait_event_interruptible(priv->event_rx_wait,
+					kfifo_len(&priv->event_fifo) != 0);
+	if (ret)
+		return ret;
+
+	while (ret < count) {
+		if (kfifo_to_user(&priv->event_fifo, buf,
+		      sizeof(struct rio_event), &copied))
+			return -EFAULT;
+		ret += copied;
+		buf += copied;
+	}
+
+	return ret;
+}
+
+static ssize_t mport_write(struct file *filp, const char __user *buf,
+			 size_t count, loff_t *ppos)
+{
+	struct mport_cdev_priv *priv = filp->private_data;
+	struct rio_mport *mport = priv->md->mport;
+	struct rio_event event;
+	int len, ret;
+
+	if (!count)
+		return 0;
+
+	if (count % sizeof(event))
+		return -EINVAL;
+
+	len = 0;
+	while ((count - len) >= (int)sizeof(event)) {
+		if (copy_from_user(&event, buf, sizeof(event)))
+			return -EFAULT;
+
+		if (event.header != RIO_DOORBELL)
+			return -EINVAL;
+
+		ret = rio_mport_send_doorbell(mport,
+					      (u16)event.u.doorbell.rioid,
+					      event.u.doorbell.payload);
+		if (ret < 0)
+			return ret;
+
+		len += sizeof(event);
+		buf += sizeof(event);
+	}
+
+	return len;
+}
+
+static const struct file_operations mport_fops = {
+	.owner		= THIS_MODULE,
+	.open		= mport_cdev_open,
+	.release	= mport_cdev_release,
+	.poll		= mport_cdev_poll,
+	.read		= mport_read,
+	.write		= mport_write,
+	.mmap		= mport_cdev_mmap,
+	.fasync		= mport_cdev_fasync,
+	.unlocked_ioctl = mport_cdev_ioctl
+};
+
+/*
+ * Character device management
+ */
+
+static void mport_device_release(struct device *dev)
+{
+	struct mport_dev *md;
+
+	rmcd_debug(EXIT, "%s", dev_name(dev));
+	md = container_of(dev, struct mport_dev, dev);
+	kfree(md);
+}
+
+/*
+ * mport_cdev_add() - Create mport_dev from rio_mport
+ * @mport:	RapidIO master port
+ */
+static struct mport_dev *mport_cdev_add(struct rio_mport *mport)
+{
+	int ret = 0;
+	struct mport_dev *md;
+	struct rio_mport_attr attr;
+
+	md = kzalloc(sizeof(struct mport_dev), GFP_KERNEL);
+	if (!md) {
+		rmcd_error("Unable allocate a device object");
+		return NULL;
+	}
+
+	md->mport = mport;
+	mutex_init(&md->buf_mutex);
+	mutex_init(&md->file_mutex);
+	INIT_LIST_HEAD(&md->file_list);
+	cdev_init(&md->cdev, &mport_fops);
+	md->cdev.owner = THIS_MODULE;
+	ret = cdev_add(&md->cdev, MKDEV(MAJOR(dev_number), mport->id), 1);
+	if (ret < 0) {
+		kfree(md);
+		rmcd_error("Unable to register a device, err=%d", ret);
+		return NULL;
+	}
+
+	md->dev.devt = md->cdev.dev;
+	md->dev.class = dev_class;
+	md->dev.parent = &mport->dev;
+	md->dev.release = mport_device_release;
+	dev_set_name(&md->dev, DEV_NAME "%d", mport->id);
+	atomic_set(&md->active, 1);
+
+	ret = device_register(&md->dev);
+	if (ret) {
+		rmcd_error("Failed to register mport %d (err=%d)",
+		       mport->id, ret);
+		goto err_cdev;
+	}
+
+	get_device(&md->dev);
+
+	INIT_LIST_HEAD(&md->doorbells);
+	spin_lock_init(&md->db_lock);
+	INIT_LIST_HEAD(&md->portwrites);
+	spin_lock_init(&md->pw_lock);
+	INIT_LIST_HEAD(&md->mappings);
+
+	md->properties.id = mport->id;
+	md->properties.sys_size = mport->sys_size;
+	md->properties.hdid = mport->host_deviceid;
+	md->properties.index = mport->index;
+
+	/* The transfer_mode property will be returned through mport query
+	 * interface
+	 */
+#ifdef CONFIG_PPC /* for now: only on Freescale's SoCs */
+	md->properties.transfer_mode |= RIO_TRANSFER_MODE_MAPPED;
+#else
+	md->properties.transfer_mode |= RIO_TRANSFER_MODE_TRANSFER;
+#endif
+	ret = rio_query_mport(mport, &attr);
+	if (!ret) {
+		md->properties.flags = attr.flags;
+		md->properties.link_speed = attr.link_speed;
+		md->properties.link_width = attr.link_width;
+		md->properties.dma_max_sge = attr.dma_max_sge;
+		md->properties.dma_max_size = attr.dma_max_size;
+		md->properties.dma_align = attr.dma_align;
+		md->properties.cap_sys_size = 0;
+		md->properties.cap_transfer_mode = 0;
+		md->properties.cap_addr_size = 0;
+	} else
+		pr_info(DRV_PREFIX "Failed to obtain info for %s cdev(%d:%d)\n",
+			mport->name, MAJOR(dev_number), mport->id);
+
+	mutex_lock(&mport_devs_lock);
+	list_add_tail(&md->node, &mport_devs);
+	mutex_unlock(&mport_devs_lock);
+
+	pr_info(DRV_PREFIX "Added %s cdev(%d:%d)\n",
+		mport->name, MAJOR(dev_number), mport->id);
+
+	return md;
+
+err_cdev:
+	cdev_del(&md->cdev);
+	kfree(md);
+	return NULL;
+}
+
+/*
+ * mport_cdev_terminate_dma() - Stop all active DMA data transfers and release
+ *                              associated DMA channels.
+ */
+static void mport_cdev_terminate_dma(struct mport_dev *md)
+{
+#ifdef CONFIG_RAPIDIO_DMA_ENGINE
+	struct mport_cdev_priv *client;
+
+	rmcd_debug(DMA, "%s", dev_name(&md->dev));
+
+	mutex_lock(&md->file_mutex);
+	list_for_each_entry(client, &md->file_list, list) {
+		if (client->dmach) {
+			dmaengine_terminate_all(client->dmach);
+			rio_release_dma(client->dmach);
+		}
+	}
+	mutex_unlock(&md->file_mutex);
+
+	if (md->dma_chan) {
+		dmaengine_terminate_all(md->dma_chan);
+		rio_release_dma(md->dma_chan);
+		md->dma_chan = NULL;
+	}
+#endif
+}
+
+
+/*
+ * mport_cdev_kill_fasync() - Send SIGIO signal to all processes with open
+ *                            mport_cdev files.
+ */
+static int mport_cdev_kill_fasync(struct mport_dev *md)
+{
+	unsigned int files = 0;
+	struct mport_cdev_priv *client;
+
+	mutex_lock(&md->file_mutex);
+	list_for_each_entry(client, &md->file_list, list) {
+		if (client->async_queue)
+			kill_fasync(&client->async_queue, SIGIO, POLL_HUP);
+		files++;
+	}
+	mutex_unlock(&md->file_mutex);
+	return files;
+}
+
+/*
+ * mport_cdev_remove() - Remove mport character device
+ * @dev:	Mport device to remove
+ */
+static void mport_cdev_remove(struct mport_dev *md)
+{
+	struct rio_mport_mapping *map, *_map;
+
+	rmcd_debug(EXIT, "Remove %s cdev", md->mport->name);
+	atomic_set(&md->active, 0);
+	mport_cdev_terminate_dma(md);
+	rio_del_mport_pw_handler(md->mport, md, rio_mport_pw_handler);
+	cdev_del(&(md->cdev));
+	mport_cdev_kill_fasync(md);
+
+	flush_workqueue(dma_wq);
+
+	/* TODO: do we need to give clients some time to close file
+	 * descriptors? Simple wait for XX, or kref?
+	 */
+
+	/*
+	 * Release DMA buffers allocated for the mport device.
+	 * Disable associated inbound Rapidio requests mapping if applicable.
+	 */
+	mutex_lock(&md->buf_mutex);
+	list_for_each_entry_safe(map, _map, &md->mappings, node) {
+		kref_put(&map->ref, mport_release_mapping);
+	}
+	mutex_unlock(&md->buf_mutex);
+
+	if (!list_empty(&md->mappings))
+		rmcd_warn("WARNING: %s pending mappings on removal",
+			  md->mport->name);
+
+	rio_release_inb_dbell(md->mport, 0, 0x0fff);
+
+	device_unregister(&md->dev);
+	put_device(&md->dev);
+}
+
+/*
+ * RIO rio_mport_interface driver
+ */
+
+/*
+ * mport_add_mport() - Add rio_mport from LDM device struct
+ * @dev:		Linux device model struct
+ * @class_intf:	Linux class_interface
+ */
+static int mport_add_mport(struct device *dev,
+		struct class_interface *class_intf)
+{
+	struct rio_mport *mport = NULL;
+	struct mport_dev *chdev = NULL;
+
+	mport = to_rio_mport(dev);
+	if (!mport)
+		return -ENODEV;
+
+	chdev = mport_cdev_add(mport);
+	if (!chdev)
+		return -ENODEV;
+
+	return 0;
+}
+
+/*
+ * mport_remove_mport() - Remove rio_mport from global list
+ * TODO remove device from global mport_dev list
+ */
+static void mport_remove_mport(struct device *dev,
+		struct class_interface *class_intf)
+{
+	struct rio_mport *mport = NULL;
+	struct mport_dev *chdev;
+	int found = 0;
+
+	mport = to_rio_mport(dev);
+	rmcd_debug(EXIT, "Remove %s", mport->name);
+
+	mutex_lock(&mport_devs_lock);
+	list_for_each_entry(chdev, &mport_devs, node) {
+		if (chdev->mport->id == mport->id) {
+			atomic_set(&chdev->active, 0);
+			list_del(&chdev->node);
+			found = 1;
+			break;
+		}
+	}
+	mutex_unlock(&mport_devs_lock);
+
+	if (found)
+		mport_cdev_remove(chdev);
+}
+
+/* the rio_mport_interface is used to handle local mport devices */
+static struct class_interface rio_mport_interface __refdata = {
+	.class		= &rio_mport_class,
+	.add_dev	= mport_add_mport,
+	.remove_dev	= mport_remove_mport,
+};
+
+/*
+ * Linux kernel module
+ */
+
+/*
+ * mport_init - Driver module loading
+ */
+static int __init mport_init(void)
+{
+	int ret;
+
+	/* Create device class needed by udev */
+	dev_class = class_create(THIS_MODULE, DRV_NAME);
+	if (!dev_class) {
+		rmcd_error("Unable to create " DRV_NAME " class");
+		return -EINVAL;
+	}
+
+	ret = alloc_chrdev_region(&dev_number, 0, RIO_MAX_MPORTS, DRV_NAME);
+	if (ret < 0)
+		goto err_chr;
+
+	rmcd_debug(INIT, "Registered class with major=%d", MAJOR(dev_number));
+
+	/* Register to rio_mport_interface */
+	ret = class_interface_register(&rio_mport_interface);
+	if (ret) {
+		rmcd_error("class_interface_register() failed, err=%d", ret);
+		goto err_cli;
+	}
+
+	dma_wq = create_singlethread_workqueue("dma_wq");
+	if (!dma_wq) {
+		rmcd_error("failed to create DMA work queue");
+		ret = -ENOMEM;
+		goto err_wq;
+	}
+
+	return 0;
+
+err_wq:
+	class_interface_unregister(&rio_mport_interface);
+err_cli:
+	unregister_chrdev_region(dev_number, RIO_MAX_MPORTS);
+err_chr:
+	class_destroy(dev_class);
+	return ret;
+}
+
+/**
+ * mport_exit - Driver module unloading
+ */
+static void __exit mport_exit(void)
+{
+	class_interface_unregister(&rio_mport_interface);
+	class_destroy(dev_class);
+	unregister_chrdev_region(dev_number, RIO_MAX_MPORTS);
+	destroy_workqueue(dma_wq);
+}
+
+module_init(mport_init);
+module_exit(mport_exit);
diff --git a/include/linux/rio_mport_cdev.h b/include/linux/rio_mport_cdev.h
new file mode 100644
index 000000000000..b65d19df76d2
--- /dev/null
+++ b/include/linux/rio_mport_cdev.h
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2015-2016, Integrated Device Technology Inc.
+ * Copyright (c) 2015, Prodrive Technologies
+ * Copyright (c) 2015, Texas Instruments Incorporated
+ * Copyright (c) 2015, RapidIO Trade Association
+ * All rights reserved.
+ *
+ * This software is available to you under a choice of one of two licenses.
+ * You may choose to be licensed under the terms of the GNU General Public
+ * License(GPL) Version 2, or the BSD-3 Clause license below:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RIO_MPORT_CDEV_H_
+#define _RIO_MPORT_CDEV_H_
+
+#ifndef __user
+#define __user
+#endif
+
+struct rio_mport_maint_io {
+	uint32_t rioid;		/* destID of remote device */
+	uint32_t hopcount;	/* hopcount to remote device */
+	uint32_t offset;	/* offset in register space */
+	size_t length;		/* length in bytes */
+	void __user *buffer;	/* data buffer */
+};
+
+/*
+ * Definitions for RapidIO data transfers:
+ * - memory mapped (MAPPED)
+ * - packet generation from memory (TRANSFER)
+ */
+#define RIO_TRANSFER_MODE_MAPPED	(1 << 0)
+#define RIO_TRANSFER_MODE_TRANSFER	(1 << 1)
+#define RIO_CAP_DBL_SEND		(1 << 2)
+#define RIO_CAP_DBL_RECV		(1 << 3)
+#define RIO_CAP_PW_SEND			(1 << 4)
+#define RIO_CAP_PW_RECV			(1 << 5)
+#define RIO_CAP_MAP_OUTB		(1 << 6)
+#define RIO_CAP_MAP_INB			(1 << 7)
+
+struct rio_mport_properties {
+	uint16_t hdid;
+	uint8_t id;			/* Physical port ID */
+	uint8_t  index;
+	uint32_t flags;
+	uint32_t sys_size;		/* Default addressing size */
+	uint8_t  port_ok;
+	uint8_t  link_speed;
+	uint8_t  link_width;
+	uint32_t dma_max_sge;
+	uint32_t dma_max_size;
+	uint32_t dma_align;
+	uint32_t transfer_mode;		/* Default transfer mode */
+	uint32_t cap_sys_size;		/* Capable system sizes */
+	uint32_t cap_addr_size;		/* Capable addressing sizes */
+	uint32_t cap_transfer_mode;	/* Capable transfer modes */
+	uint32_t cap_mport;		/* Mport capabilities */
+};
+
+/*
+ * Definitions for RapidIO events;
+ * - incoming port-writes
+ * - incoming doorbells
+ */
+#define RIO_DOORBELL	(1 << 0)
+#define RIO_PORTWRITE	(1 << 1)
+
+struct rio_doorbell {
+	uint32_t rioid;
+	uint16_t payload;
+};
+
+struct rio_doorbell_filter {
+	uint32_t rioid;			/* 0xffffffff to match all ids */
+	uint16_t low;
+	uint16_t high;
+};
+
+
+struct rio_portwrite {
+	uint32_t payload[16];
+};
+
+struct rio_pw_filter {
+	uint32_t mask;
+	uint32_t low;
+	uint32_t high;
+};
+
+/* RapidIO base address for inbound requests set to value defined below
+ * indicates that no specific RIO-to-local address translation is requested
+ * and driver should use direct (one-to-one) address mapping.
+*/
+#define RIO_MAP_ANY_ADDR	(uint64_t)(~((uint64_t) 0))
+
+struct rio_mmap {
+	uint32_t rioid;
+	uint64_t rio_addr;
+	uint64_t length;
+	uint64_t handle;
+	void *address;
+};
+
+struct rio_dma_mem {
+	uint64_t length;		/* length of DMA memory */
+	uint64_t dma_handle;		/* handle associated with this memory */
+	void *buffer;			/* pointer to this memory */
+};
+
+
+struct rio_event {
+	unsigned int header;	/* event type RIO_DOORBELL or RIO_PORTWRITE */
+	union {
+		struct rio_doorbell doorbell;	/* header for RIO_DOORBELL */
+		struct rio_portwrite portwrite; /* header for RIO_PORTWRITE */
+	} u;
+};
+
+enum rio_transfer_sync {
+	RIO_TRANSFER_SYNC,	/* synchronous transfer */
+	RIO_TRANSFER_ASYNC,	/* asynchronous transfer */
+	RIO_TRANSFER_FAF,	/* fire-and-forget transfer */
+};
+
+enum rio_transfer_dir {
+	RIO_TRANSFER_DIR_READ,	/* Read operation */
+	RIO_TRANSFER_DIR_WRITE,	/* Write operation */
+};
+
+/*
+ * RapidIO data exchange transactions are lists of individual transfers. Each
+ * transfer exchanges data between two RapidIO devices by remote direct memory
+ * access and has its own completion code.
+ *
+ * The RapidIO specification defines four types of data exchange requests:
+ * NREAD, NWRITE, SWRITE and NWRITE_R. The RapidIO DMA channel interface allows
+ * to specify the required type of write operation or combination of them when
+ * only the last data packet requires response.
+ *
+ * NREAD:    read up to 256 bytes from remote device memory into local memory
+ * NWRITE:   write up to 256 bytes from local memory to remote device memory
+ *           without confirmation
+ * SWRITE:   as NWRITE, but all addresses and payloads must be 64-bit aligned
+ * NWRITE_R: as NWRITE, but expect acknowledgment from remote device.
+ *
+ * The default exchange is chosen from NREAD and any of the WRITE modes as the
+ * driver sees fit. For write requests the user can explicitly choose between
+ * any of the write modes for each transaction.
+ */
+enum rio_exchange {
+	RIO_EXCHANGE_DEFAULT,	/* Default method */
+	RIO_EXCHANGE_NWRITE,	/* All packets using NWRITE */
+	RIO_EXCHANGE_SWRITE,	/* All packets using SWRITE */
+	RIO_EXCHANGE_NWRITE_R,	/* Last packet NWRITE_R, others NWRITE */
+	RIO_EXCHANGE_SWRITE_R,	/* Last packet NWRITE_R, others SWRITE */
+	RIO_EXCHANGE_NWRITE_R_ALL, /* All packets using NWRITE_R */
+};
+
+struct rio_transfer_io {
+	uint32_t rioid;			/* Target destID */
+	uint64_t rio_addr;		/* Address in target's RIO mem space */
+	enum rio_exchange method;	/* Data exchange method */
+	void __user *loc_addr;
+	uint64_t handle;
+	uint64_t offset;		/* Offset in buffer */
+	uint64_t length;		/* Length in bytes */
+	uint32_t completion_code;	/* Completion code for this transfer */
+};
+
+struct rio_transaction {
+	uint32_t transfer_mode;		/* Data transfer mode */
+	enum rio_transfer_sync sync;	/* Synchronization method */
+	enum rio_transfer_dir dir;	/* Transfer direction */
+	size_t count;			/* Number of transfers */
+	struct rio_transfer_io __user *block;	/* Array of <count> transfers */
+};
+
+struct rio_async_tx_wait {
+	uint32_t token;		/* DMA transaction ID token */
+	uint32_t timeout;	/* Wait timeout in msec, if 0 use default TO */
+};
+
+#define RIO_MAX_DEVNAME_SZ	20
+
+struct rio_rdev_info {
+	uint32_t destid;
+	uint8_t hopcount;
+	uint32_t comptag;
+	char name[RIO_MAX_DEVNAME_SZ + 1];
+};
+
+/* Driver IOCTL codes */
+#define RIO_MPORT_DRV_MAGIC           'm'
+
+#define RIO_MPORT_MAINT_HDID_SET	\
+	_IOW(RIO_MPORT_DRV_MAGIC, 1, uint16_t)
+#define RIO_MPORT_MAINT_COMPTAG_SET	\
+	_IOW(RIO_MPORT_DRV_MAGIC, 2, uint32_t)
+#define RIO_MPORT_MAINT_PORT_IDX_GET	\
+	_IOR(RIO_MPORT_DRV_MAGIC, 3, uint32_t)
+#define RIO_MPORT_GET_PROPERTIES \
+	_IOR(RIO_MPORT_DRV_MAGIC, 4, struct rio_mport_properties)
+#define RIO_MPORT_MAINT_READ_LOCAL \
+	_IOR(RIO_MPORT_DRV_MAGIC, 5, struct rio_mport_maint_io)
+#define RIO_MPORT_MAINT_WRITE_LOCAL \
+	_IOW(RIO_MPORT_DRV_MAGIC, 6, struct rio_mport_maint_io)
+#define RIO_MPORT_MAINT_READ_REMOTE \
+	_IOR(RIO_MPORT_DRV_MAGIC, 7, struct rio_mport_maint_io)
+#define RIO_MPORT_MAINT_WRITE_REMOTE \
+	_IOW(RIO_MPORT_DRV_MAGIC, 8, struct rio_mport_maint_io)
+#define RIO_ENABLE_DOORBELL_RANGE	\
+	_IOW(RIO_MPORT_DRV_MAGIC, 9, struct rio_doorbell_filter)
+#define RIO_DISABLE_DOORBELL_RANGE	\
+	_IOW(RIO_MPORT_DRV_MAGIC, 10, struct rio_doorbell_filter)
+#define RIO_ENABLE_PORTWRITE_RANGE	\
+	_IOW(RIO_MPORT_DRV_MAGIC, 11, struct rio_pw_filter)
+#define RIO_DISABLE_PORTWRITE_RANGE	\
+	_IOW(RIO_MPORT_DRV_MAGIC, 12, struct rio_pw_filter)
+#define RIO_SET_EVENT_MASK		\
+	_IOW(RIO_MPORT_DRV_MAGIC, 13, unsigned int)
+#define RIO_GET_EVENT_MASK		\
+	_IOR(RIO_MPORT_DRV_MAGIC, 14, unsigned int)
+#define RIO_MAP_OUTBOUND \
+	_IOWR(RIO_MPORT_DRV_MAGIC, 15, struct rio_mmap)
+#define RIO_UNMAP_OUTBOUND \
+	_IOW(RIO_MPORT_DRV_MAGIC, 16, struct rio_mmap)
+#define RIO_MAP_INBOUND \
+	_IOWR(RIO_MPORT_DRV_MAGIC, 17, struct rio_mmap)
+#define RIO_UNMAP_INBOUND \
+	_IOW(RIO_MPORT_DRV_MAGIC, 18, uint64_t)
+#define RIO_ALLOC_DMA \
+	_IOWR(RIO_MPORT_DRV_MAGIC, 19, struct rio_dma_mem)
+#define RIO_FREE_DMA \
+	_IOW(RIO_MPORT_DRV_MAGIC, 20, uint64_t)
+#define RIO_TRANSFER \
+	_IOWR(RIO_MPORT_DRV_MAGIC, 21, struct rio_transaction)
+#define RIO_WAIT_FOR_ASYNC \
+	_IOW(RIO_MPORT_DRV_MAGIC, 22, struct rio_async_tx_wait)
+#define RIO_DEV_ADD \
+	_IOW(RIO_MPORT_DRV_MAGIC, 23, struct rio_rdev_info)
+#define RIO_DEV_DEL \
+	_IOW(RIO_MPORT_DRV_MAGIC, 24, struct rio_rdev_info)
+
+#endif /* _RIO_MPORT_CDEV_H_ */
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 0495884defc1..b71fd0b5cbad 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -354,6 +354,7 @@ header-y += reiserfs_fs.h
 header-y += reiserfs_xattr.h
 header-y += resource.h
 header-y += rfkill.h
+header-y += rio_mport_cdev.h
 header-y += romfs_fs.h
 header-y += rose.h
 header-y += route.h
-- 
cgit v1.2.3


From 5c9a8750a6409c63a0f01d51a9024861022f6593 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Tue, 22 Mar 2016 14:27:30 -0700
Subject: kernel: add kcov code coverage

kcov provides code coverage collection for coverage-guided fuzzing
(randomized testing).  Coverage-guided fuzzing is a testing technique
that uses coverage feedback to determine new interesting inputs to a
system.  A notable user-space example is AFL
(http://lcamtuf.coredump.cx/afl/).  However, this technique is not
widely used for kernel testing due to missing compiler and kernel
support.

kcov does not aim to collect as much coverage as possible.  It aims to
collect more or less stable coverage that is function of syscall inputs.
To achieve this goal it does not collect coverage in soft/hard
interrupts and instrumentation of some inherently non-deterministic or
non-interesting parts of kernel is disbled (e.g.  scheduler, locking).

Currently there is a single coverage collection mode (tracing), but the
API anticipates additional collection modes.  Initially I also
implemented a second mode which exposes coverage in a fixed-size hash
table of counters (what Quentin used in his original patch).  I've
dropped the second mode for simplicity.

This patch adds the necessary support on kernel side.  The complimentary
compiler support was added in gcc revision 231296.

We've used this support to build syzkaller system call fuzzer, which has
found 90 kernel bugs in just 2 months:

  https://github.com/google/syzkaller/wiki/Found-Bugs

We've also found 30+ bugs in our internal systems with syzkaller.
Another (yet unexplored) direction where kcov coverage would greatly
help is more traditional "blob mutation".  For example, mounting a
random blob as a filesystem, or receiving a random blob over wire.

Why not gcov.  Typical fuzzing loop looks as follows: (1) reset
coverage, (2) execute a bit of code, (3) collect coverage, repeat.  A
typical coverage can be just a dozen of basic blocks (e.g.  an invalid
input).  In such context gcov becomes prohibitively expensive as
reset/collect coverage steps depend on total number of basic
blocks/edges in program (in case of kernel it is about 2M).  Cost of
kcov depends only on number of executed basic blocks/edges.  On top of
that, kernel requires per-thread coverage because there are always
background threads and unrelated processes that also produce coverage.
With inlined gcov instrumentation per-thread coverage is not possible.

kcov exposes kernel PCs and control flow to user-space which is
insecure.  But debugfs should not be mapped as user accessible.

Based on a patch by Quentin Casasnovas.

[akpm@linux-foundation.org: make task_struct.kcov_mode have type `enum kcov_mode']
[akpm@linux-foundation.org: unbreak allmodconfig]
[akpm@linux-foundation.org: follow x86 Makefile layout standards]
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: syzkaller <syzkaller@googlegroups.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Tavis Ormandy <taviso@google.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Kees Cook <keescook@google.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: David Drysdale <drysdale@google.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kcov.txt                | 111 ++++++++++++++
 Makefile                              |  11 +-
 arch/x86/Kconfig                      |   1 +
 arch/x86/boot/Makefile                |   7 +
 arch/x86/boot/compressed/Makefile     |   3 +
 arch/x86/entry/vdso/Makefile          |   3 +
 arch/x86/kernel/Makefile              |   6 +
 arch/x86/kernel/apic/Makefile         |   4 +
 arch/x86/kernel/cpu/Makefile          |   4 +
 arch/x86/lib/Makefile                 |   3 +
 arch/x86/mm/Makefile                  |   3 +
 arch/x86/realmode/rm/Makefile         |   3 +
 drivers/firmware/efi/libstub/Makefile |   3 +
 include/linux/kcov.h                  |  29 ++++
 include/linux/sched.h                 |  11 ++
 include/uapi/linux/kcov.h             |  10 ++
 kernel/Makefile                       |  12 ++
 kernel/exit.c                         |   2 +
 kernel/fork.c                         |   3 +
 kernel/kcov.c                         | 273 ++++++++++++++++++++++++++++++++++
 kernel/locking/Makefile               |   3 +
 kernel/rcu/Makefile                   |   4 +
 kernel/sched/Makefile                 |   4 +
 lib/Kconfig.debug                     |  21 +++
 lib/Makefile                          |  12 ++
 mm/Makefile                           |  15 ++
 mm/kasan/Makefile                     |   1 +
 scripts/Makefile.lib                  |   6 +
 28 files changed, 567 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/kcov.txt
 create mode 100644 include/linux/kcov.h
 create mode 100644 include/uapi/linux/kcov.h
 create mode 100644 kernel/kcov.c

(limited to 'include/uapi/linux')

diff --git a/Documentation/kcov.txt b/Documentation/kcov.txt
new file mode 100644
index 000000000000..779ff4ab1c1d
--- /dev/null
+++ b/Documentation/kcov.txt
@@ -0,0 +1,111 @@
+kcov: code coverage for fuzzing
+===============================
+
+kcov exposes kernel code coverage information in a form suitable for coverage-
+guided fuzzing (randomized testing). Coverage data of a running kernel is
+exported via the "kcov" debugfs file. Coverage collection is enabled on a task
+basis, and thus it can capture precise coverage of a single system call.
+
+Note that kcov does not aim to collect as much coverage as possible. It aims
+to collect more or less stable coverage that is function of syscall inputs.
+To achieve this goal it does not collect coverage in soft/hard interrupts
+and instrumentation of some inherently non-deterministic parts of kernel is
+disbled (e.g. scheduler, locking).
+
+Usage:
+======
+
+Configure kernel with:
+
+        CONFIG_KCOV=y
+
+CONFIG_KCOV requires gcc built on revision 231296 or later.
+Profiling data will only become accessible once debugfs has been mounted:
+
+        mount -t debugfs none /sys/kernel/debug
+
+The following program demonstrates kcov usage from within a test program:
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define KCOV_INIT_TRACE			_IOR('c', 1, unsigned long)
+#define KCOV_ENABLE			_IO('c', 100)
+#define KCOV_DISABLE			_IO('c', 101)
+#define COVER_SIZE			(64<<10)
+
+int main(int argc, char **argv)
+{
+	int fd;
+	unsigned long *cover, n, i;
+
+	/* A single fd descriptor allows coverage collection on a single
+	 * thread.
+	 */
+	fd = open("/sys/kernel/debug/kcov", O_RDWR);
+	if (fd == -1)
+		perror("open"), exit(1);
+	/* Setup trace mode and trace size. */
+	if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
+		perror("ioctl"), exit(1);
+	/* Mmap buffer shared between kernel- and user-space. */
+	cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long),
+				     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if ((void*)cover == MAP_FAILED)
+		perror("mmap"), exit(1);
+	/* Enable coverage collection on the current thread. */
+	if (ioctl(fd, KCOV_ENABLE, 0))
+		perror("ioctl"), exit(1);
+	/* Reset coverage from the tail of the ioctl() call. */
+	__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
+	/* That's the target syscal call. */
+	read(-1, NULL, 0);
+	/* Read number of PCs collected. */
+	n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
+	for (i = 0; i < n; i++)
+		printf("0x%lx\n", cover[i + 1]);
+	/* Disable coverage collection for the current thread. After this call
+	 * coverage can be enabled for a different thread.
+	 */
+	if (ioctl(fd, KCOV_DISABLE, 0))
+		perror("ioctl"), exit(1);
+	/* Free resources. */
+	if (munmap(cover, COVER_SIZE * sizeof(unsigned long)))
+		perror("munmap"), exit(1);
+	if (close(fd))
+		perror("close"), exit(1);
+	return 0;
+}
+
+After piping through addr2line output of the program looks as follows:
+
+SyS_read
+fs/read_write.c:562
+__fdget_pos
+fs/file.c:774
+__fget_light
+fs/file.c:746
+__fget_light
+fs/file.c:750
+__fget_light
+fs/file.c:760
+__fdget_pos
+fs/file.c:784
+SyS_read
+fs/read_write.c:562
+
+If a program needs to collect coverage from several threads (independently),
+it needs to open /sys/kernel/debug/kcov in each thread separately.
+
+The interface is fine-grained to allow efficient forking of test processes.
+That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode,
+mmaps coverage buffer and then forks child processes in a loop. Child processes
+only need to enable coverage (disable happens automatically on thread end).
diff --git a/Makefile b/Makefile
index e055b969c325..b98a4f70d1b5 100644
--- a/Makefile
+++ b/Makefile
@@ -365,6 +365,7 @@ LDFLAGS_MODULE  =
 CFLAGS_KERNEL	=
 AFLAGS_KERNEL	=
 CFLAGS_GCOV	= -fprofile-arcs -ftest-coverage
+CFLAGS_KCOV	= -fsanitize-coverage=trace-pc
 
 
 # Use USERINCLUDE when you must reference the UAPI directories only.
@@ -411,7 +412,7 @@ export MAKE AWK GENKSYMS INSTALLKERNEL PERL PYTHON UTS_MACHINE
 export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
 
 export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
-export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN CFLAGS_UBSAN
+export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KCOV CFLAGS_KASAN CFLAGS_UBSAN
 export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
 export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
 export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
@@ -673,6 +674,14 @@ endif
 endif
 KBUILD_CFLAGS += $(stackp-flag)
 
+ifdef CONFIG_KCOV
+  ifeq ($(call cc-option, $(CFLAGS_KCOV)),)
+    $(warning Cannot use CONFIG_KCOV: \
+             -fsanitize-coverage=trace-pc is not supported by compiler)
+    CFLAGS_KCOV =
+  endif
+endif
+
 ifeq ($(cc-name),clang)
 KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
 KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8b680a5cb25b..54478b7635de 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_GCOV_PROFILE_ALL
+	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_MMIO_FLUSH
 	select ARCH_HAS_SG_CHAIN
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 0bf6749522d9..b1ef9e489084 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -12,6 +12,13 @@
 KASAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Kernel does not boot with kcov instrumentation here.
+# One of the problems observed was insertion of __sanitizer_cov_trace_pc()
+# callback into middle of per-cpu data enabling code. Thus the callback observed
+# inconsistent state and crashed. We are interested mostly in syscall coverage,
+# so boot code is not interesting anyway.
+KCOV_INSTRUMENT		:= n
+
 # If you want to preset the SVGA mode, uncomment the next line and
 # set SVGA_MODE to whatever number you want.
 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 5e1d26e09407..6915ff2bd996 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,6 +19,9 @@
 KASAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT		:= n
+
 targets := vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma \
 	vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4
 
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index f9fb859c98b9..6874da5f67fc 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -7,6 +7,9 @@ KASAN_SANITIZE			:= n
 UBSAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT		:= n
+
 VDSO64-$(CONFIG_X86_64)		:= y
 VDSOX32-$(CONFIG_X86_X32_ABI)	:= y
 VDSO32-$(CONFIG_X86_32)		:= y
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index d5fb0871aba3..adaae2c781c1 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,6 +25,12 @@ OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o	:= y
 OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_test_nx.o			:= y
 
+# If instrumentation of this dir is enabled, boot hangs during first second.
+# Probably could be more selective here, but note that files related to irqs,
+# boot, dumpstack/stacktrace, etc are either non-interesting or can lead to
+# non-deterministic coverage.
+KCOV_INSTRUMENT		:= n
+
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y			:= process_$(BITS).o signal.o
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 8bb12ddc5db8..8e63ebdcbd0b 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,6 +2,10 @@
 # Makefile for local APIC drivers and for the IO-APIC code
 #
 
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particualr, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT		:= n
+
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o ipi.o vector.o
 obj-y				+= hw_nmi.o
 
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 0d373d7affc8..4a8697f7d4ef 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg
 CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+
 # Make sure load_percpu_segment has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_common.o		:= $(nostackp)
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index a501fa25da41..72a576752a7e 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,6 +2,9 @@
 # Makefile for x86 specific library files.
 #
 
+# Produces uninteresting flaky coverage.
+KCOV_INSTRUMENT_delay.o	:= n
+
 inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
 inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
 quiet_cmd_inat_tables = GEN     $@
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 67cf2e1e557b..f98913258c63 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,3 +1,6 @@
+# Kernel does not boot with instrumentation of tlb.c.
+KCOV_INSTRUMENT_tlb.o	:= n
+
 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	    pat.o pgtable.o physaddr.o gup.o setup_nx.o
 
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index 053abe7b0ef7..b95964610ea7 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -9,6 +9,9 @@
 KASAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT		:= n
+
 always := realmode.bin realmode.relocs
 
 wakeup-objs	:= wakeup_asm.o wakemain.o video-mode.o
diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile
index a15841eced4e..da99bbb74aeb 100644
--- a/drivers/firmware/efi/libstub/Makefile
+++ b/drivers/firmware/efi/libstub/Makefile
@@ -25,6 +25,9 @@ KASAN_SANITIZE			:= n
 UBSAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT			:= n
+
 lib-y				:= efi-stub-helper.o
 
 # include the stub's generic dependencies from lib/ when building for ARM/arm64
diff --git a/include/linux/kcov.h b/include/linux/kcov.h
new file mode 100644
index 000000000000..2883ac98c280
--- /dev/null
+++ b/include/linux/kcov.h
@@ -0,0 +1,29 @@
+#ifndef _LINUX_KCOV_H
+#define _LINUX_KCOV_H
+
+#include <uapi/linux/kcov.h>
+
+struct task_struct;
+
+#ifdef CONFIG_KCOV
+
+void kcov_task_init(struct task_struct *t);
+void kcov_task_exit(struct task_struct *t);
+
+enum kcov_mode {
+	/* Coverage collection is not enabled yet. */
+	KCOV_MODE_DISABLED = 0,
+	/*
+	 * Tracing coverage collection mode.
+	 * Covered PCs are collected in a per-task buffer.
+	 */
+	KCOV_MODE_TRACE = 1,
+};
+
+#else
+
+static inline void kcov_task_init(struct task_struct *t) {}
+static inline void kcov_task_exit(struct task_struct *t) {}
+
+#endif /* CONFIG_KCOV */
+#endif /* _LINUX_KCOV_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 084ed9fba620..34495d2d2d7b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -51,6 +51,7 @@ struct sched_param {
 #include <linux/resource.h>
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
+#include <linux/kcov.h>
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
 #include <linux/cred.h>
@@ -1818,6 +1819,16 @@ struct task_struct {
 	/* bitmask and counter of trace recursion */
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
+#ifdef CONFIG_KCOV
+	/* Coverage collection mode enabled for this task (0 if disabled). */
+	enum kcov_mode kcov_mode;
+	/* Size of the kcov_area. */
+	unsigned	kcov_size;
+	/* Buffer for coverage collection. */
+	void		*kcov_area;
+	/* kcov desciptor wired with this task or NULL. */
+	struct kcov	*kcov;
+#endif
 #ifdef CONFIG_MEMCG
 	struct mem_cgroup *memcg_in_oom;
 	gfp_t memcg_oom_gfp_mask;
diff --git a/include/uapi/linux/kcov.h b/include/uapi/linux/kcov.h
new file mode 100644
index 000000000000..574e22ec640d
--- /dev/null
+++ b/include/uapi/linux/kcov.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_KCOV_IOCTLS_H
+#define _LINUX_KCOV_IOCTLS_H
+
+#include <linux/types.h>
+
+#define KCOV_INIT_TRACE			_IOR('c', 1, unsigned long)
+#define KCOV_ENABLE			_IO('c', 100)
+#define KCOV_DISABLE			_IO('c', 101)
+
+#endif /* _LINUX_KCOV_IOCTLS_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index baa55e50a315..f0c40bf49d9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -18,6 +18,17 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
 endif
 
+# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
+# in coverage traces.
+KCOV_INSTRUMENT_softirq.o := n
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+KCOV_INSTRUMENT_extable.o := n
+# Don't self-instrument.
+KCOV_INSTRUMENT_kcov.o := n
+KASAN_SANITIZE_kcov.o := n
+
 # cond_syscall is currently not LTO compatible
 CFLAGS_sys_ni.o = $(DISABLE_LTO)
 
@@ -68,6 +79,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_KCOV) += kcov.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 10e088237fed..953d1a1c0387 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -53,6 +53,7 @@
 #include <linux/oom.h>
 #include <linux/writeback.h>
 #include <linux/shm.h>
+#include <linux/kcov.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -655,6 +656,7 @@ void do_exit(long code)
 	TASKS_RCU(int tasks_rcu_i);
 
 	profile_task_exit(tsk);
+	kcov_task_exit(tsk);
 
 	WARN_ON(blk_needs_flush_plug(tsk));
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 5b8d1e7ceeea..d277e83ed3e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -75,6 +75,7 @@
 #include <linux/aio.h>
 #include <linux/compiler.h>
 #include <linux/sysctl.h>
+#include <linux/kcov.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -392,6 +393,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
 	account_kernel_stack(ti, 1);
 
+	kcov_task_init(tsk);
+
 	return tsk;
 
 free_ti:
diff --git a/kernel/kcov.c b/kernel/kcov.c
new file mode 100644
index 000000000000..3efbee0834a8
--- /dev/null
+++ b/kernel/kcov.c
@@ -0,0 +1,273 @@
+#define pr_fmt(fmt) "kcov: " fmt
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/kcov.h>
+
+/*
+ * kcov descriptor (one per opened debugfs file).
+ * State transitions of the descriptor:
+ *  - initial state after open()
+ *  - then there must be a single ioctl(KCOV_INIT_TRACE) call
+ *  - then, mmap() call (several calls are allowed but not useful)
+ *  - then, repeated enable/disable for a task (only one task a time allowed)
+ */
+struct kcov {
+	/*
+	 * Reference counter. We keep one for:
+	 *  - opened file descriptor
+	 *  - task with enabled coverage (we can't unwire it from another task)
+	 */
+	atomic_t		refcount;
+	/* The lock protects mode, size, area and t. */
+	spinlock_t		lock;
+	enum kcov_mode		mode;
+	/* Size of arena (in long's for KCOV_MODE_TRACE). */
+	unsigned		size;
+	/* Coverage buffer shared with user space. */
+	void			*area;
+	/* Task for which we collect coverage, or NULL. */
+	struct task_struct	*t;
+};
+
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void __sanitizer_cov_trace_pc(void)
+{
+	struct task_struct *t;
+	enum kcov_mode mode;
+
+	t = current;
+	/*
+	 * We are interested in code coverage as a function of a syscall inputs,
+	 * so we ignore code executed in interrupts.
+	 */
+	if (!t || in_interrupt())
+		return;
+	mode = READ_ONCE(t->kcov_mode);
+	if (mode == KCOV_MODE_TRACE) {
+		unsigned long *area;
+		unsigned long pos;
+
+		/*
+		 * There is some code that runs in interrupts but for which
+		 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+		 * READ_ONCE()/barrier() effectively provides load-acquire wrt
+		 * interrupts, there are paired barrier()/WRITE_ONCE() in
+		 * kcov_ioctl_locked().
+		 */
+		barrier();
+		area = t->kcov_area;
+		/* The first word is number of subsequent PCs. */
+		pos = READ_ONCE(area[0]) + 1;
+		if (likely(pos < t->kcov_size)) {
+			area[pos] = _RET_IP_;
+			WRITE_ONCE(area[0], pos);
+		}
+	}
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+
+static void kcov_get(struct kcov *kcov)
+{
+	atomic_inc(&kcov->refcount);
+}
+
+static void kcov_put(struct kcov *kcov)
+{
+	if (atomic_dec_and_test(&kcov->refcount)) {
+		vfree(kcov->area);
+		kfree(kcov);
+	}
+}
+
+void kcov_task_init(struct task_struct *t)
+{
+	t->kcov_mode = KCOV_MODE_DISABLED;
+	t->kcov_size = 0;
+	t->kcov_area = NULL;
+	t->kcov = NULL;
+}
+
+void kcov_task_exit(struct task_struct *t)
+{
+	struct kcov *kcov;
+
+	kcov = t->kcov;
+	if (kcov == NULL)
+		return;
+	spin_lock(&kcov->lock);
+	if (WARN_ON(kcov->t != t)) {
+		spin_unlock(&kcov->lock);
+		return;
+	}
+	/* Just to not leave dangling references behind. */
+	kcov_task_init(t);
+	kcov->t = NULL;
+	spin_unlock(&kcov->lock);
+	kcov_put(kcov);
+}
+
+static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+	int res = 0;
+	void *area;
+	struct kcov *kcov = vma->vm_file->private_data;
+	unsigned long size, off;
+	struct page *page;
+
+	area = vmalloc_user(vma->vm_end - vma->vm_start);
+	if (!area)
+		return -ENOMEM;
+
+	spin_lock(&kcov->lock);
+	size = kcov->size * sizeof(unsigned long);
+	if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+	    vma->vm_end - vma->vm_start != size) {
+		res = -EINVAL;
+		goto exit;
+	}
+	if (!kcov->area) {
+		kcov->area = area;
+		vma->vm_flags |= VM_DONTEXPAND;
+		spin_unlock(&kcov->lock);
+		for (off = 0; off < size; off += PAGE_SIZE) {
+			page = vmalloc_to_page(kcov->area + off);
+			if (vm_insert_page(vma, vma->vm_start + off, page))
+				WARN_ONCE(1, "vm_insert_page() failed");
+		}
+		return 0;
+	}
+exit:
+	spin_unlock(&kcov->lock);
+	vfree(area);
+	return res;
+}
+
+static int kcov_open(struct inode *inode, struct file *filep)
+{
+	struct kcov *kcov;
+
+	kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
+	if (!kcov)
+		return -ENOMEM;
+	atomic_set(&kcov->refcount, 1);
+	spin_lock_init(&kcov->lock);
+	filep->private_data = kcov;
+	return nonseekable_open(inode, filep);
+}
+
+static int kcov_close(struct inode *inode, struct file *filep)
+{
+	kcov_put(filep->private_data);
+	return 0;
+}
+
+static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
+			     unsigned long arg)
+{
+	struct task_struct *t;
+	unsigned long size, unused;
+
+	switch (cmd) {
+	case KCOV_INIT_TRACE:
+		/*
+		 * Enable kcov in trace mode and setup buffer size.
+		 * Must happen before anything else.
+		 */
+		if (kcov->mode != KCOV_MODE_DISABLED)
+			return -EBUSY;
+		/*
+		 * Size must be at least 2 to hold current position and one PC.
+		 * Later we allocate size * sizeof(unsigned long) memory,
+		 * that must not overflow.
+		 */
+		size = arg;
+		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+			return -EINVAL;
+		kcov->size = size;
+		kcov->mode = KCOV_MODE_TRACE;
+		return 0;
+	case KCOV_ENABLE:
+		/*
+		 * Enable coverage for the current task.
+		 * At this point user must have been enabled trace mode,
+		 * and mmapped the file. Coverage collection is disabled only
+		 * at task exit or voluntary by KCOV_DISABLE. After that it can
+		 * be enabled for another task.
+		 */
+		unused = arg;
+		if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
+		    kcov->area == NULL)
+			return -EINVAL;
+		if (kcov->t != NULL)
+			return -EBUSY;
+		t = current;
+		/* Cache in task struct for performance. */
+		t->kcov_size = kcov->size;
+		t->kcov_area = kcov->area;
+		/* See comment in __sanitizer_cov_trace_pc(). */
+		barrier();
+		WRITE_ONCE(t->kcov_mode, kcov->mode);
+		t->kcov = kcov;
+		kcov->t = t;
+		/* This is put either in kcov_task_exit() or in KCOV_DISABLE. */
+		kcov_get(kcov);
+		return 0;
+	case KCOV_DISABLE:
+		/* Disable coverage for the current task. */
+		unused = arg;
+		if (unused != 0 || current->kcov != kcov)
+			return -EINVAL;
+		t = current;
+		if (WARN_ON(kcov->t != t))
+			return -EINVAL;
+		kcov_task_init(t);
+		kcov->t = NULL;
+		kcov_put(kcov);
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
+
+static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	struct kcov *kcov;
+	int res;
+
+	kcov = filep->private_data;
+	spin_lock(&kcov->lock);
+	res = kcov_ioctl_locked(kcov, cmd, arg);
+	spin_unlock(&kcov->lock);
+	return res;
+}
+
+static const struct file_operations kcov_fops = {
+	.open		= kcov_open,
+	.unlocked_ioctl	= kcov_ioctl,
+	.mmap		= kcov_mmap,
+	.release        = kcov_close,
+};
+
+static int __init kcov_init(void)
+{
+	if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) {
+		pr_err("failed to create kcov in debugfs\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+device_initcall(kcov_init);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 8e96f6cc2a4a..31322a4275cd 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,3 +1,6 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT		:= n
 
 obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
 
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 61a16569ffbf..032b2c015beb 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,3 +1,7 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
+
 obj-y += update.o sync.o
 obj-$(CONFIG_SRCU) += srcu.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 302d6ebd64f7..414d9c16da42 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
 endif
 
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only.  Why this used to be enabled for all architectures is beyond
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5a60f45cd9bb..532d4d52d1df 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -696,6 +696,27 @@ source "lib/Kconfig.kasan"
 
 endmenu # "Memory Debugging"
 
+config ARCH_HAS_KCOV
+	bool
+	help
+	  KCOV does not have any arch-specific code, but currently it is enabled
+	  only for x86_64. KCOV requires testing on other archs, and most likely
+	  disabling of instrumentation for some early boot code.
+
+config KCOV
+	bool "Code coverage for fuzzing"
+	depends on ARCH_HAS_KCOV
+	select DEBUG_FS
+	help
+	  KCOV exposes kernel code coverage information in a form suitable
+	  for coverage-guided fuzzing (randomized testing).
+
+	  If RANDOMIZE_BASE is enabled, PC values will not be stable across
+	  different machines and across reboots. If you need stable PC values,
+	  disable RANDOMIZE_BASE.
+
+	  For more details, see Documentation/kcov.txt.
+
 config DEBUG_SHIRQ
 	bool "Debug shared IRQ handlers"
 	depends on DEBUG_KERNEL
diff --git a/lib/Makefile b/lib/Makefile
index 4962d14c450f..a1de5b61ff40 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -7,6 +7,18 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst $(CC_FLAGS_FTRACE),,$(ORIG_CFLAGS))
 endif
 
+# These files are disabled because they produce lots of non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. For example,
+# rbtree can be global and individual rotations don't correlate with inputs.
+KCOV_INSTRUMENT_string.o := n
+KCOV_INSTRUMENT_rbtree.o := n
+KCOV_INSTRUMENT_list_debug.o := n
+KCOV_INSTRUMENT_debugobjects.o := n
+KCOV_INSTRUMENT_dynamic_debug.o := n
+# Kernel does not boot if we instrument this file as it uses custom calling
+# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
+KCOV_INSTRUMENT_hweight.o := n
+
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
 	 idr.o int_sqrt.o extable.o \
diff --git a/mm/Makefile b/mm/Makefile
index 6da300a1414b..f5e797cbd128 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,6 +5,21 @@
 KASAN_SANITIZE_slab_common.o := n
 KASAN_SANITIZE_slub.o := n
 
+# These files are disabled because they produce non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
+# free pages, or a task is migrated between nodes.
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_kmemcheck.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= gup.o highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index a61460d9f5b0..131daadf40e4 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,5 +1,6 @@
 KASAN_SANITIZE := n
 UBSAN_SANITIZE_kasan.o := n
+KCOV_INSTRUMENT := n
 
 CFLAGS_REMOVE_kasan.o = -pg
 # Function splitter causes unnecessary splits in __asan_load1/__asan_store1
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index ad50d5859ac4..ddf83d0181e7 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -136,6 +136,12 @@ _c_flags += $(if $(patsubst n%,, \
 		$(CFLAGS_UBSAN))
 endif
 
+ifeq ($(CONFIG_KCOV),y)
+_c_flags += $(if $(patsubst n%,, \
+	$(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)y), \
+	$(CFLAGS_KCOV))
+endif
+
 # If building the kernel in a separate objtree expand all occurrences
 # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/').
 
-- 
cgit v1.2.3


From 743bc4b069517d3bae69349a1a23511eaaae5ede Mon Sep 17 00:00:00 2001
From: John Youn <johnyoun@synopsys.com>
Date: Mon, 28 Mar 2016 16:12:21 -0700
Subject: usb: ch9: Fix SSP Device Cap wFunctionalitySupport type

The wFunctionalitySupport field should be __le16.

Signed-off-by: John Youn <johnyoun@synopsys.com>
Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
---
 include/uapi/linux/usb/ch9.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h
index 06d6c6228a7a..d5ce71607972 100644
--- a/include/uapi/linux/usb/ch9.h
+++ b/include/uapi/linux/usb/ch9.h
@@ -899,7 +899,7 @@ struct usb_ssp_cap_descriptor {
 	__le32 bmAttributes;
 #define USB_SSP_SUBLINK_SPEED_ATTRIBS	(0x1f << 0) /* sublink speed entries */
 #define USB_SSP_SUBLINK_SPEED_IDS	(0xf << 5)  /* speed ID entries */
-	__u16  wFunctionalitySupport;
+	__le16  wFunctionalitySupport;
 #define USB_SSP_MIN_SUBLINK_SPEED_ATTRIBUTE_ID	(0xf)
 #define USB_SSP_MIN_RX_LANE_COUNT		(0xf << 8)
 #define USB_SSP_MIN_TX_LANE_COUNT		(0xf << 12)
-- 
cgit v1.2.3


From 283d757378371e8044d873e661b1dccee46c5887 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Wed, 30 Mar 2016 00:14:57 +0200
Subject: uapi/linux/stddef.h: Provide __always_inline to userspace headers

Josh Boyer reported that my recent change to uapi/linux/swab.h broke the Qemu build:

  bc27fb68aaad ("include/uapi/linux/byteorder, swab: force inlining of some byteswap operations")

Unfortunately, UAPI headers don't include compiler.h so fixing it there is not enough,
add an __always_inline definition to uapi/linux/stddef.h instead.

Testcase: "make headers_install" and try to compile this:

	#include <linux/swab.h>
	void main() {}

Reported-by: Josh Boyer <jwboyer@fedoraproject.org>
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/1459289697-12875-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/uapi/linux/stddef.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/stddef.h b/include/uapi/linux/stddef.h
index aa9f10428743..621fa8ac4425 100644
--- a/include/uapi/linux/stddef.h
+++ b/include/uapi/linux/stddef.h
@@ -1 +1,5 @@
 #include <linux/compiler.h>
+
+#ifndef __always_inline
+#define __always_inline inline
+#endif
-- 
cgit v1.2.3


From c0e760c9c66ebd8a5a1ede81868677f4df993dfb Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 30 Mar 2016 00:02:00 +0200
Subject: bpf: make padding in bpf_tunnel_key explicit

Make the 2 byte padding in struct bpf_tunnel_key between tunnel_ttl
and tunnel_label members explicit. No issue has been observed, and
gcc/llvm does padding for the old struct already, where tunnel_label
was not yet present, so the current code works, but since it's part
of uapi, make sure we don't introduce holes in structs.

Therefore, add tunnel_ext that we can use generically in future
(f.e. to flag OAM messages for backends, etc). Also add the offset
to the compat tests to be sure should some compilers not padd the
tail of the old version of bpf_tunnel_key.

Fixes: 4018ab1875e0 ("bpf: support flow label for bpf_skb_{set, get}_tunnel_key")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/bpf.h | 1 +
 net/core/filter.c        | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 924f537183fd..23917bb47bf3 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -375,6 +375,7 @@ struct bpf_tunnel_key {
 	};
 	__u8 tunnel_tos;
 	__u8 tunnel_ttl;
+	__u16 tunnel_ext;
 	__u32 tunnel_label;
 };
 
diff --git a/net/core/filter.c b/net/core/filter.c
index b7177d01ecb0..4b81b71171b4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1764,6 +1764,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
 		case offsetof(struct bpf_tunnel_key, tunnel_label):
+		case offsetof(struct bpf_tunnel_key, tunnel_ext):
 			goto set_compat;
 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
 			/* Fixup deprecated structure layouts here, so we have
@@ -1849,6 +1850,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 	if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
 		switch (size) {
 		case offsetof(struct bpf_tunnel_key, tunnel_label):
+		case offsetof(struct bpf_tunnel_key, tunnel_ext):
 		case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
 			/* Fixup deprecated structure layouts here, so we have
 			 * a common path later on.
@@ -1861,7 +1863,8 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
 			return -EINVAL;
 		}
 	}
-	if (unlikely(!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label))
+	if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
+		     from->tunnel_ext))
 		return -EINVAL;
 
 	skb_dst_drop(skb);
-- 
cgit v1.2.3


From c00bbcf8628969e103d4a7b351a53746f1025576 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@redhat.com>
Date: Tue, 29 Mar 2016 16:43:45 +0100
Subject: virtio: add VIRTIO_CONFIG_S_NEEDS_RESET device status bit

The VIRTIO 1.0 specification added the DEVICE_NEEDS_RESET device status
bit in "VIRTIO-98: Add DEVICE_NEEDS_RESET".  This patch defines the
device status bit in the uapi header file so that both the kernel and
userspace applications can use it.

The bit is currently unused by the virtio guest drivers and vhost.
According to the spec "a good implementation will try to recover by
issuing a reset".  This is not attempted here because it requires
auditing the virtio drivers to ensure there are no resource leaks or
crashes if the device needs to be reset mid-operation.

See "2.1 Device Status Field" in the VIRTIO 1.0 specification for
details.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 include/uapi/linux/virtio_config.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include/uapi/linux')

diff --git a/include/uapi/linux/virtio_config.h b/include/uapi/linux/virtio_config.h
index c18264df9504..4cb65bbfa654 100644
--- a/include/uapi/linux/virtio_config.h
+++ b/include/uapi/linux/virtio_config.h
@@ -40,6 +40,8 @@
 #define VIRTIO_CONFIG_S_DRIVER_OK	4
 /* Driver has finished configuring features */
 #define VIRTIO_CONFIG_S_FEATURES_OK	8
+/* Device entered invalid state, driver must reset it */
+#define VIRTIO_CONFIG_S_NEEDS_RESET	0x40
 /* We've given up on this device. */
 #define VIRTIO_CONFIG_S_FAILED		0x80
 
-- 
cgit v1.2.3