From 069e2b351de67e7a837b15b3d26c65c19b790cc3 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Fri, 14 Jun 2013 19:55:13 +0000
Subject: slob: Rework #ifdeffery in slab.h

Make the SLOB specific stuff harmonize more with the way the other allocators
do it. Create the typical kmalloc constants for that purpose. SLOB does not
support it but the constants help us avoid #ifdefs.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 include/linux/slab.h | 39 ++++++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0c621752caa6..9690c14eb7fb 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -169,11 +169,7 @@ struct kmem_cache {
 	struct list_head list;	/* List of all slab caches on the system */
 };
 
-#define KMALLOC_MAX_SIZE (1UL << 30)
-
-#include <linux/slob_def.h>
-
-#else /* CONFIG_SLOB */
+#endif /* CONFIG_SLOB */
 
 /*
  * Kmalloc array related definitions
@@ -195,7 +191,9 @@ struct kmem_cache {
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	5
 #endif
-#else
+#endif
+
+#ifdef CONFIG_SLUB
 /*
  * SLUB allocates up to order 2 pages directly and otherwise
  * passes the request to the page allocator.
@@ -207,6 +205,19 @@ struct kmem_cache {
 #endif
 #endif
 
+#ifdef CONFIG_SLOB
+/*
+ * SLOB passes all page size and larger requests to the page allocator.
+ * No kmalloc array is necessary since objects of different sizes can
+ * be allocated from the same page.
+ */
+#define KMALLOC_SHIFT_MAX	30
+#define KMALLOC_SHIFT_HIGH	PAGE_SHIFT
+#ifndef KMALLOC_SHIFT_LOW
+#define KMALLOC_SHIFT_LOW	3
+#endif
+#endif
+
 /* Maximum allocatable size */
 #define KMALLOC_MAX_SIZE	(1UL << KMALLOC_SHIFT_MAX)
 /* Maximum size for which we actually use a slab cache */
@@ -221,6 +232,7 @@ struct kmem_cache {
 #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
 #endif
 
+#ifndef CONFIG_SLOB
 extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
 #ifdef CONFIG_ZONE_DMA
 extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
@@ -275,13 +287,18 @@ static __always_inline int kmalloc_index(size_t size)
 	/* Will never be reached. Needed because the compiler may complain */
 	return -1;
 }
+#endif /* !CONFIG_SLOB */
 
 #ifdef CONFIG_SLAB
 #include <linux/slab_def.h>
-#elif defined(CONFIG_SLUB)
+#endif
+
+#ifdef CONFIG_SLUB
 #include <linux/slub_def.h>
-#else
-#error "Unknown slab allocator"
+#endif
+
+#ifdef CONFIG_SLOB
+#include <linux/slob_def.h>
 #endif
 
 /*
@@ -291,6 +308,7 @@ static __always_inline int kmalloc_index(size_t size)
  */
 static __always_inline int kmalloc_size(int n)
 {
+#ifndef CONFIG_SLOB
 	if (n > 2)
 		return 1 << n;
 
@@ -299,10 +317,9 @@ static __always_inline int kmalloc_size(int n)
 
 	if (n == 2 && KMALLOC_MIN_SIZE <= 64)
 		return 192;
-
+#endif
 	return 0;
 }
-#endif /* !CONFIG_SLOB */
 
 /*
  * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
-- 
cgit v1.2.3


From 3c00ea82c724fab0b98f15428a804cb45eb9ad38 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 19 May 2013 20:45:15 +0200
Subject: watchdog: Rename confusing state variable

We have two very conflicting state variable names in the
watchdog:

* watchdog_enabled: This one reflects the user interface. It's
set to 1 by default and can be overriden with boot options
or sysctl/procfs interface.

* watchdog_disabled: This is the internal toggle state that
tells if watchdog threads, timers and NMI events are currently
running or not. This state mostly depends on the user settings.
It's a convenient state latch.

Now we really need to find clearer names because those
are just too confusing to encourage deep review.

watchdog_enabled now becomes watchdog_user_enabled to reflect
its purpose as an interface.

watchdog_disabled becomes watchdog_running to suggest its
role as a pure internal state.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Cc: Anish Singh <anish198519851985@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Li Zhong <zhong@linux.vnet.ibm.com>
Cc: Don Zickus <dzickus@redhat.com>
---
 include/linux/nmi.h |  2 +-
 kernel/sysctl.c     |  4 ++--
 kernel/watchdog.c   | 30 +++++++++++++++---------------
 3 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index db50840e6355..6a45fb583ff1 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -46,7 +46,7 @@ static inline bool trigger_all_cpu_backtrace(void)
 #ifdef CONFIG_LOCKUP_DETECTOR
 int hw_nmi_is_cpu_stuck(struct pt_regs *);
 u64 hw_nmi_get_sample_period(int watchdog_thresh);
-extern int watchdog_enabled;
+extern int watchdog_user_enabled;
 extern int watchdog_thresh;
 struct ctl_table;
 extern int proc_dowatchdog(struct ctl_table *, int ,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf456e0fc..b0805652c4ff 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -801,7 +801,7 @@ static struct ctl_table kern_table[] = {
 #if defined(CONFIG_LOCKUP_DETECTOR)
 	{
 		.procname       = "watchdog",
-		.data           = &watchdog_enabled,
+		.data           = &watchdog_user_enabled,
 		.maxlen         = sizeof (int),
 		.mode           = 0644,
 		.proc_handler   = proc_dowatchdog,
@@ -828,7 +828,7 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.procname       = "nmi_watchdog",
-		.data           = &watchdog_enabled,
+		.data           = &watchdog_user_enabled,
 		.maxlen         = sizeof (int),
 		.mode           = 0644,
 		.proc_handler   = proc_dowatchdog,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 52c9a9b91bdd..51c4f34d258e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -29,9 +29,9 @@
 #include <linux/kvm_para.h>
 #include <linux/perf_event.h>
 
-int watchdog_enabled = 1;
+int watchdog_user_enabled = 1;
 int __read_mostly watchdog_thresh = 10;
-static int __read_mostly watchdog_disabled = 1;
+static int __read_mostly watchdog_running;
 static u64 __read_mostly sample_period;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -63,7 +63,7 @@ static int __init hardlockup_panic_setup(char *str)
 	else if (!strncmp(str, "nopanic", 7))
 		hardlockup_panic = 0;
 	else if (!strncmp(str, "0", 1))
-		watchdog_enabled = 0;
+		watchdog_user_enabled = 0;
 	return 1;
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
@@ -82,7 +82,7 @@ __setup("softlockup_panic=", softlockup_panic_setup);
 
 static int __init nowatchdog_setup(char *str)
 {
-	watchdog_enabled = 0;
+	watchdog_user_enabled = 0;
 	return 1;
 }
 __setup("nowatchdog", nowatchdog_setup);
@@ -90,7 +90,7 @@ __setup("nowatchdog", nowatchdog_setup);
 /* deprecated */
 static int __init nosoftlockup_setup(char *str)
 {
-	watchdog_enabled = 0;
+	watchdog_user_enabled = 0;
 	return 1;
 }
 __setup("nosoftlockup", nosoftlockup_setup);
@@ -158,7 +158,7 @@ void touch_all_softlockup_watchdogs(void)
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 void touch_nmi_watchdog(void)
 {
-	if (watchdog_enabled) {
+	if (watchdog_user_enabled) {
 		unsigned cpu;
 
 		for_each_present_cpu(cpu) {
@@ -490,12 +490,12 @@ static int watchdog_enable_all_cpus(void)
 {
 	int err = 0;
 
-	if (watchdog_disabled) {
+	if (!watchdog_running) {
 		err = smpboot_register_percpu_thread(&watchdog_threads);
 		if (err)
 			pr_err("Failed to create watchdog threads, disabled\n");
 		else
-			watchdog_disabled = 0;
+			watchdog_running = 1;
 	}
 
 	return err;
@@ -506,8 +506,8 @@ static int watchdog_enable_all_cpus(void)
 #ifdef CONFIG_SYSCTL
 static void watchdog_disable_all_cpus(void)
 {
-	if (!watchdog_disabled) {
-		watchdog_disabled = 1;
+	if (watchdog_running) {
+		watchdog_running = 0;
 		smpboot_unregister_percpu_thread(&watchdog_threads);
 	}
 }
@@ -522,7 +522,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 	int err, old_thresh, old_enabled;
 
 	old_thresh = ACCESS_ONCE(watchdog_thresh);
-	old_enabled = ACCESS_ONCE(watchdog_enabled);
+	old_enabled = ACCESS_ONCE(watchdog_user_enabled);
 
 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (err || !write)
@@ -531,10 +531,10 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 	set_sample_period();
 	/*
 	 * Watchdog threads shouldn't be enabled if they are
-	 * disabled. The 'watchdog_disabled' variable check in
+	 * disabled. The 'watchdog_running' variable check in
 	 * watchdog_*_all_cpus() function takes care of this.
 	 */
-	if (watchdog_enabled && watchdog_thresh)
+	if (watchdog_user_enabled && watchdog_thresh)
 		err = watchdog_enable_all_cpus();
 	else
 		watchdog_disable_all_cpus();
@@ -542,7 +542,7 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 	/* Restore old values on failure */
 	if (err) {
 		watchdog_thresh = old_thresh;
-		watchdog_enabled = old_enabled;
+		watchdog_user_enabled = old_enabled;
 	}
 
 	return err;
@@ -553,6 +553,6 @@ void __init lockup_detector_init(void)
 {
 	set_sample_period();
 
-	if (watchdog_enabled)
+	if (watchdog_user_enabled)
 		watchdog_enable_all_cpus();
 }
-- 
cgit v1.2.3


From 8d36eb01da5d371feffa280e501377b5c450f5a5 Mon Sep 17 00:00:00 2001
From: Sean Hefty <sean.hefty@intel.com>
Date: Wed, 29 May 2013 10:09:07 -0700
Subject: RDMA/cma: Define native IB address

Define AF_IB and sockaddr_ib to allow the rdma_cm to use native IB
addressing.

Signed-off-by: Sean Hefty <sean.hefty@intel.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 include/linux/socket.h |  2 ++
 include/rdma/ib.h      | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100644 include/rdma/ib.h

(limited to 'include/linux')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index b10ce4b341ea..230c04bda3e2 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -167,6 +167,7 @@ struct ucred {
 #define AF_PPPOX	24	/* PPPoX sockets		*/
 #define AF_WANPIPE	25	/* Wanpipe API Sockets */
 #define AF_LLC		26	/* Linux LLC			*/
+#define AF_IB		27	/* Native InfiniBand address	*/
 #define AF_CAN		29	/* Controller Area Network      */
 #define AF_TIPC		30	/* TIPC sockets			*/
 #define AF_BLUETOOTH	31	/* Bluetooth sockets 		*/
@@ -211,6 +212,7 @@ struct ucred {
 #define PF_PPPOX	AF_PPPOX
 #define PF_WANPIPE	AF_WANPIPE
 #define PF_LLC		AF_LLC
+#define PF_IB		AF_IB
 #define PF_CAN		AF_CAN
 #define PF_TIPC		AF_TIPC
 #define PF_BLUETOOTH	AF_BLUETOOTH
diff --git a/include/rdma/ib.h b/include/rdma/ib.h
new file mode 100644
index 000000000000..cf8f9e700e48
--- /dev/null
+++ b/include/rdma/ib.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2010 Intel Corporation.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#if !defined(_RDMA_IB_H)
+#define _RDMA_IB_H
+
+#include <linux/types.h>
+
+struct ib_addr {
+	union {
+		__u8		uib_addr8[16];
+		__be16		uib_addr16[8];
+		__be32		uib_addr32[4];
+		__be64		uib_addr64[2];
+	} ib_u;
+#define sib_addr8		ib_u.uib_addr8
+#define sib_addr16		ib_u.uib_addr16
+#define sib_addr32		ib_u.uib_addr32
+#define sib_addr64		ib_u.uib_addr64
+#define sib_raw			ib_u.uib_addr8
+#define sib_subnet_prefix	ib_u.uib_addr64[0]
+#define sib_interface_id	ib_u.uib_addr64[1]
+};
+
+static inline int ib_addr_any(const struct ib_addr *a)
+{
+	return ((a->sib_addr64[0] | a->sib_addr64[1]) == 0);
+}
+
+static inline int ib_addr_loopback(const struct ib_addr *a)
+{
+	return ((a->sib_addr32[0] | a->sib_addr32[1] |
+		 a->sib_addr32[2] | (a->sib_addr32[3] ^ htonl(1))) == 0);
+}
+
+static inline void ib_addr_set(struct ib_addr *addr,
+			       __be32 w1, __be32 w2, __be32 w3, __be32 w4)
+{
+	addr->sib_addr32[0] = w1;
+	addr->sib_addr32[1] = w2;
+	addr->sib_addr32[2] = w3;
+	addr->sib_addr32[3] = w4;
+}
+
+static inline int ib_addr_cmp(const struct ib_addr *a1, const struct ib_addr *a2)
+{
+	return memcmp(a1, a2, sizeof(struct ib_addr));
+}
+
+struct sockaddr_ib {
+	unsigned short int	sib_family;	/* AF_IB */
+	__be16			sib_pkey;
+	__be32			sib_flowinfo;
+	struct ib_addr		sib_addr;
+	__be64			sib_sid;
+	__be64			sib_sid_mask;
+	__u64			sib_scope_id;
+};
+
+#endif /* _RDMA_IB_H */
-- 
cgit v1.2.3


From 1c297a66654a3295ae87e2b7f3724d214eb2b5ec Mon Sep 17 00:00:00 2001
From: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Date: Mon, 1 Jul 2013 15:20:00 +0100
Subject: iio: Fix iio_channel_has_info

Since the info_mask split, iio_channel_has_info() is not working correctly.
info_mask_separate and info_mask_shared_by_type, it is not possible to compare
them directly with the iio_chan_info_enum enum. Correct that bit using the BIT()
macro.

Cc: <stable@vger.kernel.org> # 3.10.x
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Signed-off-by: Jonathan Cameron <jic23@kernel.org>
---
 include/linux/iio/iio.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 8d171f427632..3d35b7023591 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -211,8 +211,8 @@ struct iio_chan_spec {
 static inline bool iio_channel_has_info(const struct iio_chan_spec *chan,
 	enum iio_chan_info_enum type)
 {
-	return (chan->info_mask_separate & type) |
-	       (chan->info_mask_shared_by_type & type);
+	return (chan->info_mask_separate & BIT(type)) |
+	       (chan->info_mask_shared_by_type & BIT(type));
 }
 
 #define IIO_ST(si, rb, sb, sh)						\
-- 
cgit v1.2.3


From e7efa615ccf78394338144ff0187be331240748a Mon Sep 17 00:00:00 2001
From: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Date: Tue, 25 Jun 2013 18:16:55 +0200
Subject: slab: add kmalloc() to kernel API documentation

At the moment, kmalloc() isn't even listed in the kernel API
documentation (DocBook/kernel-api.html after running "make htmldocs").

Another issue is that the documentation for kmalloc_node()
refers to kcalloc()'s documentation to describe its 'flags' parameter,
while kcalloc() refered to kmalloc()'s documentation, which doesn't exist!

This patch is a proposed fix for this. It also removes the documentation
for kmalloc() in include/linux/slob_def.h which isn't included to
generate the documentation anyway. This way, kmalloc() is described
in only one place.

Acked-by: Christoph Lameter <cl@linux.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Michael Opdenacker <michael.opdenacker@free-electrons.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
---
 include/linux/slab.h     | 18 ++++++++++++++----
 include/linux/slob_def.h |  8 --------
 2 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9690c14eb7fb..6c5cc0ea8713 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -373,9 +373,8 @@ int cache_show(struct kmem_cache *s, struct seq_file *m);
 void print_slabinfo_header(struct seq_file *m);
 
 /**
- * kmalloc_array - allocate memory for an array.
- * @n: number of elements.
- * @size: element size.
+ * kmalloc - allocate memory
+ * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
  *
  * The @flags argument may be one of:
@@ -422,6 +421,17 @@ void print_slabinfo_header(struct seq_file *m);
  * There are other flags available as well, but these are not intended
  * for general use, and so are not documented here. For a full list of
  * potential flags, always refer to linux/gfp.h.
+ *
+ * kmalloc is the normal method of allocating memory
+ * in the kernel.
+ */
+static __always_inline void *kmalloc(size_t size, gfp_t flags);
+
+/**
+ * kmalloc_array - allocate memory for an array.
+ * @n: number of elements.
+ * @size: element size.
+ * @flags: the type of memory to allocate (see kmalloc).
  */
 static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
 {
@@ -445,7 +455,7 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 /**
  * kmalloc_node - allocate memory from a specific node
  * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kcalloc).
+ * @flags: the type of memory to allocate (see kmalloc).
  * @node: node to allocate from.
  *
  * kmalloc() for non-local nodes, used to allocate from a specific node
diff --git a/include/linux/slob_def.h b/include/linux/slob_def.h
index f28e14a12e3f..095a5a4a8516 100644
--- a/include/linux/slob_def.h
+++ b/include/linux/slob_def.h
@@ -18,14 +18,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 	return __kmalloc_node(size, flags, node);
 }
 
-/**
- * kmalloc - allocate memory
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kcalloc).
- *
- * kmalloc is the normal method of allocating memory
- * in the kernel.
- */
 static __always_inline void *kmalloc(size_t size, gfp_t flags)
 {
 	return __kmalloc_node(size, flags, NUMA_NO_NODE);
-- 
cgit v1.2.3


From e126ba97dba9edeb6fafa3665b5f8497fc9cdf8c Mon Sep 17 00:00:00 2001
From: Eli Cohen <eli@mellanox.com>
Date: Sun, 7 Jul 2013 17:25:49 +0300
Subject: mlx5: Add driver for Mellanox Connect-IB adapters

The driver is comprised of two kernel modules: mlx5_ib and mlx5_core.
This partitioning resembles what we have for mlx4, except that mlx5_ib
is the pci device driver and not mlx5_core.

mlx5_core is essentially a library that provides general functionality
that is intended to be used by other Mellanox devices that will be
introduced in the future.  mlx5_ib has a similar role as any hardware
device under drivers/infiniband/hw.

Signed-off-by: Eli Cohen <eli@mellanox.com>
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>

[ Merge in coccinelle fixes from Fengguang Wu <fengguang.wu@intel.com>.
  - Roland ]

Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 MAINTAINERS                                        |   22 +
 drivers/infiniband/Kconfig                         |    1 +
 drivers/infiniband/Makefile                        |    1 +
 drivers/infiniband/hw/mlx5/Kconfig                 |   10 +
 drivers/infiniband/hw/mlx5/Makefile                |    3 +
 drivers/infiniband/hw/mlx5/ah.c                    |   92 +
 drivers/infiniband/hw/mlx5/cq.c                    |  843 +++++++
 drivers/infiniband/hw/mlx5/doorbell.c              |  100 +
 drivers/infiniband/hw/mlx5/mad.c                   |  139 ++
 drivers/infiniband/hw/mlx5/main.c                  | 1504 ++++++++++++
 drivers/infiniband/hw/mlx5/mem.c                   |  162 ++
 drivers/infiniband/hw/mlx5/mlx5_ib.h               |  545 +++++
 drivers/infiniband/hw/mlx5/mr.c                    | 1007 ++++++++
 drivers/infiniband/hw/mlx5/qp.c                    | 2524 ++++++++++++++++++++
 drivers/infiniband/hw/mlx5/srq.c                   |  473 ++++
 drivers/infiniband/hw/mlx5/user.h                  |  121 +
 drivers/net/ethernet/mellanox/Kconfig              |    1 +
 drivers/net/ethernet/mellanox/Makefile             |    1 +
 drivers/net/ethernet/mellanox/mlx5/core/Kconfig    |   18 +
 drivers/net/ethernet/mellanox/mlx5/core/Makefile   |    5 +
 drivers/net/ethernet/mellanox/mlx5/core/alloc.c    |  238 ++
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c      | 1515 ++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/cq.c       |  224 ++
 drivers/net/ethernet/mellanox/mlx5/core/debugfs.c  |  587 +++++
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       |  521 ++++
 drivers/net/ethernet/mellanox/mlx5/core/fw.c       |  185 ++
 drivers/net/ethernet/mellanox/mlx5/core/health.c   |  217 ++
 drivers/net/ethernet/mellanox/mlx5/core/mad.c      |   78 +
 drivers/net/ethernet/mellanox/mlx5/core/main.c     |  475 ++++
 drivers/net/ethernet/mellanox/mlx5/core/mcg.c      |  106 +
 .../net/ethernet/mellanox/mlx5/core/mlx5_core.h    |   73 +
 drivers/net/ethernet/mellanox/mlx5/core/mr.c       |  136 ++
 .../net/ethernet/mellanox/mlx5/core/pagealloc.c    |  435 ++++
 drivers/net/ethernet/mellanox/mlx5/core/pd.c       |  101 +
 drivers/net/ethernet/mellanox/mlx5/core/port.c     |  104 +
 drivers/net/ethernet/mellanox/mlx5/core/qp.c       |  301 +++
 drivers/net/ethernet/mellanox/mlx5/core/srq.c      |  223 ++
 drivers/net/ethernet/mellanox/mlx5/core/uar.c      |  223 ++
 include/linux/mlx5/cmd.h                           |   51 +
 include/linux/mlx5/cq.h                            |  165 ++
 include/linux/mlx5/device.h                        |  893 +++++++
 include/linux/mlx5/doorbell.h                      |   79 +
 include/linux/mlx5/driver.h                        |  769 ++++++
 include/linux/mlx5/qp.h                            |  467 ++++
 include/linux/mlx5/srq.h                           |   41 +
 45 files changed, 15779 insertions(+)
 create mode 100644 drivers/infiniband/hw/mlx5/Kconfig
 create mode 100644 drivers/infiniband/hw/mlx5/Makefile
 create mode 100644 drivers/infiniband/hw/mlx5/ah.c
 create mode 100644 drivers/infiniband/hw/mlx5/cq.c
 create mode 100644 drivers/infiniband/hw/mlx5/doorbell.c
 create mode 100644 drivers/infiniband/hw/mlx5/mad.c
 create mode 100644 drivers/infiniband/hw/mlx5/main.c
 create mode 100644 drivers/infiniband/hw/mlx5/mem.c
 create mode 100644 drivers/infiniband/hw/mlx5/mlx5_ib.h
 create mode 100644 drivers/infiniband/hw/mlx5/mr.c
 create mode 100644 drivers/infiniband/hw/mlx5/qp.c
 create mode 100644 drivers/infiniband/hw/mlx5/srq.c
 create mode 100644 drivers/infiniband/hw/mlx5/user.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/Kconfig
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/Makefile
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/alloc.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/cmd.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/cq.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/eq.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/fw.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/health.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/mad.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/main.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/mcg.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/mr.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/pd.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/port.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/qp.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/srq.c
 create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/uar.c
 create mode 100644 include/linux/mlx5/cmd.h
 create mode 100644 include/linux/mlx5/cq.h
 create mode 100644 include/linux/mlx5/device.h
 create mode 100644 include/linux/mlx5/doorbell.h
 create mode 100644 include/linux/mlx5/driver.h
 create mode 100644 include/linux/mlx5/qp.h
 create mode 100644 include/linux/mlx5/srq.h

(limited to 'include/linux')

diff --git a/MAINTAINERS b/MAINTAINERS
index 60d6a3393500..b4265364ed27 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5365,6 +5365,28 @@ W:	http://linuxtv.org
 S:	Odd Fixes
 F:	drivers/media/radio/radio-miropcm20*
 
+Mellanox MLX5 core VPI driver
+M:	Eli Cohen <eli@mellanox.com>
+L:	netdev@vger.kernel.org
+L:	linux-rdma@vger.kernel.org
+W:	http://www.mellanox.com
+Q:	http://patchwork.ozlabs.org/project/netdev/list/
+Q:	http://patchwork.kernel.org/project/linux-rdma/list/
+T:	git://openfabrics.org/~eli/connect-ib.git
+S:	Supported
+F:	drivers/net/ethernet/mellanox/mlx5/core/
+F:	include/linux/mlx5/
+
+Mellanox MLX5 IB driver
+M:      Eli Cohen <eli@mellanox.com>
+L:      linux-rdma@vger.kernel.org
+W:      http://www.mellanox.com
+Q:      http://patchwork.kernel.org/project/linux-rdma/list/
+T:      git://openfabrics.org/~eli/connect-ib.git
+S:      Supported
+F:      include/linux/mlx5/
+F:      drivers/infiniband/hw/mlx5/
+
 MODULE SUPPORT
 M:	Rusty Russell <rusty@rustcorp.com.au>
 S:	Maintained
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index c85b56c28099..5ceda710f516 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -50,6 +50,7 @@ source "drivers/infiniband/hw/amso1100/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
+source "drivers/infiniband/hw/mlx5/Kconfig"
 source "drivers/infiniband/hw/nes/Kconfig"
 source "drivers/infiniband/hw/ocrdma/Kconfig"
 
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
index b126fefe0b1c..1fe69888515f 100644
--- a/drivers/infiniband/Makefile
+++ b/drivers/infiniband/Makefile
@@ -7,6 +7,7 @@ obj-$(CONFIG_INFINIBAND_AMSO1100)	+= hw/amso1100/
 obj-$(CONFIG_INFINIBAND_CXGB3)		+= hw/cxgb3/
 obj-$(CONFIG_INFINIBAND_CXGB4)		+= hw/cxgb4/
 obj-$(CONFIG_MLX4_INFINIBAND)		+= hw/mlx4/
+obj-$(CONFIG_MLX5_INFINIBAND)		+= hw/mlx5/
 obj-$(CONFIG_INFINIBAND_NES)		+= hw/nes/
 obj-$(CONFIG_INFINIBAND_OCRDMA)		+= hw/ocrdma/
 obj-$(CONFIG_INFINIBAND_IPOIB)		+= ulp/ipoib/
diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig
new file mode 100644
index 000000000000..8e6aebfaf8a4
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/Kconfig
@@ -0,0 +1,10 @@
+config MLX5_INFINIBAND
+	tristate "Mellanox Connect-IB HCA support"
+	depends on NETDEVICES && ETHERNET && PCI && X86
+	select NET_VENDOR_MELLANOX
+	select MLX5_CORE
+	---help---
+	  This driver provides low-level InfiniBand support for
+	  Mellanox Connect-IB PCI Express host channel adapters (HCAs).
+	  This is required to use InfiniBand protocols such as
+	  IP-over-IB or SRP with these devices.
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
new file mode 100644
index 000000000000..4ea0135af484
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_MLX5_INFINIBAND)	+= mlx5_ib.o
+
+mlx5_ib-y :=	main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o
diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c
new file mode 100644
index 000000000000..39ab0caefdf9
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/ah.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx5_ib.h"
+
+struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
+			   struct mlx5_ib_ah *ah)
+{
+	if (ah_attr->ah_flags & IB_AH_GRH) {
+		memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16);
+		ah->av.grh_gid_fl = cpu_to_be32(ah_attr->grh.flow_label |
+						(1 << 30) |
+						ah_attr->grh.sgid_index << 20);
+		ah->av.hop_limit = ah_attr->grh.hop_limit;
+		ah->av.tclass = ah_attr->grh.traffic_class;
+	}
+
+	ah->av.rlid = cpu_to_be16(ah_attr->dlid);
+	ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f;
+	ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf);
+
+	return &ah->ibah;
+}
+
+struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+	struct mlx5_ib_ah *ah;
+
+	ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	return create_ib_ah(ah_attr, ah); /* never fails */
+}
+
+int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+	struct mlx5_ib_ah *ah = to_mah(ibah);
+	u32 tmp;
+
+	memset(ah_attr, 0, sizeof(*ah_attr));
+
+	tmp = be32_to_cpu(ah->av.grh_gid_fl);
+	if (tmp & (1 << 30)) {
+		ah_attr->ah_flags = IB_AH_GRH;
+		ah_attr->grh.sgid_index = (tmp >> 20) & 0xff;
+		ah_attr->grh.flow_label = tmp & 0xfffff;
+		memcpy(&ah_attr->grh.dgid, ah->av.rgid, 16);
+		ah_attr->grh.hop_limit = ah->av.hop_limit;
+		ah_attr->grh.traffic_class = ah->av.tclass;
+	}
+	ah_attr->dlid = be16_to_cpu(ah->av.rlid);
+	ah_attr->static_rate = ah->av.stat_rate_sl >> 4;
+	ah_attr->sl = ah->av.stat_rate_sl & 0xf;
+
+	return 0;
+}
+
+int mlx5_ib_destroy_ah(struct ib_ah *ah)
+{
+	kfree(to_mah(ah));
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
new file mode 100644
index 000000000000..344ab03948a3
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -0,0 +1,843 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kref.h>
+#include <rdma/ib_umem.h>
+#include "mlx5_ib.h"
+#include "user.h"
+
+static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq)
+{
+	struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
+
+	ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
+{
+	struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq);
+	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+	struct ib_cq *ibcq = &cq->ibcq;
+	struct ib_event event;
+
+	if (type != MLX5_EVENT_TYPE_CQ_ERROR) {
+		mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n",
+			     type, mcq->cqn);
+		return;
+	}
+
+	if (ibcq->event_handler) {
+		event.device     = &dev->ib_dev;
+		event.event      = IB_EVENT_CQ_ERR;
+		event.element.cq = ibcq;
+		ibcq->event_handler(&event, ibcq->cq_context);
+	}
+}
+
+static void *get_cqe_from_buf(struct mlx5_ib_cq_buf *buf, int n, int size)
+{
+	return mlx5_buf_offset(&buf->buf, n * size);
+}
+
+static void *get_cqe(struct mlx5_ib_cq *cq, int n)
+{
+	return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz);
+}
+
+static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n)
+{
+	void *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+	struct mlx5_cqe64 *cqe64;
+
+	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+	return ((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^
+		!!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
+}
+
+static void *next_cqe_sw(struct mlx5_ib_cq *cq)
+{
+	return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx)
+{
+	switch (wq->wr_data[idx]) {
+	case MLX5_IB_WR_UMR:
+		return 0;
+
+	case IB_WR_LOCAL_INV:
+		return IB_WC_LOCAL_INV;
+
+	case IB_WR_FAST_REG_MR:
+		return IB_WC_FAST_REG_MR;
+
+	default:
+		pr_warn("unknown completion status\n");
+		return 0;
+	}
+}
+
+static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
+			    struct mlx5_ib_wq *wq, int idx)
+{
+	wc->wc_flags = 0;
+	switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) {
+	case MLX5_OPCODE_RDMA_WRITE_IMM:
+		wc->wc_flags |= IB_WC_WITH_IMM;
+	case MLX5_OPCODE_RDMA_WRITE:
+		wc->opcode    = IB_WC_RDMA_WRITE;
+		break;
+	case MLX5_OPCODE_SEND_IMM:
+		wc->wc_flags |= IB_WC_WITH_IMM;
+	case MLX5_OPCODE_SEND:
+	case MLX5_OPCODE_SEND_INVAL:
+		wc->opcode    = IB_WC_SEND;
+		break;
+	case MLX5_OPCODE_RDMA_READ:
+		wc->opcode    = IB_WC_RDMA_READ;
+		wc->byte_len  = be32_to_cpu(cqe->byte_cnt);
+		break;
+	case MLX5_OPCODE_ATOMIC_CS:
+		wc->opcode    = IB_WC_COMP_SWAP;
+		wc->byte_len  = 8;
+		break;
+	case MLX5_OPCODE_ATOMIC_FA:
+		wc->opcode    = IB_WC_FETCH_ADD;
+		wc->byte_len  = 8;
+		break;
+	case MLX5_OPCODE_ATOMIC_MASKED_CS:
+		wc->opcode    = IB_WC_MASKED_COMP_SWAP;
+		wc->byte_len  = 8;
+		break;
+	case MLX5_OPCODE_ATOMIC_MASKED_FA:
+		wc->opcode    = IB_WC_MASKED_FETCH_ADD;
+		wc->byte_len  = 8;
+		break;
+	case MLX5_OPCODE_BIND_MW:
+		wc->opcode    = IB_WC_BIND_MW;
+		break;
+	case MLX5_OPCODE_UMR:
+		wc->opcode = get_umr_comp(wq, idx);
+		break;
+	}
+}
+
+enum {
+	MLX5_GRH_IN_BUFFER = 1,
+	MLX5_GRH_IN_CQE	   = 2,
+};
+
+static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
+			     struct mlx5_ib_qp *qp)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
+	struct mlx5_ib_srq *srq;
+	struct mlx5_ib_wq *wq;
+	u16 wqe_ctr;
+	u8 g;
+
+	if (qp->ibqp.srq || qp->ibqp.xrcd) {
+		struct mlx5_core_srq *msrq = NULL;
+
+		if (qp->ibqp.xrcd) {
+			msrq = mlx5_core_get_srq(&dev->mdev,
+						 be32_to_cpu(cqe->srqn));
+			srq = to_mibsrq(msrq);
+		} else {
+			srq = to_msrq(qp->ibqp.srq);
+		}
+		if (srq) {
+			wqe_ctr = be16_to_cpu(cqe->wqe_counter);
+			wc->wr_id = srq->wrid[wqe_ctr];
+			mlx5_ib_free_srq_wqe(srq, wqe_ctr);
+			if (msrq && atomic_dec_and_test(&msrq->refcount))
+				complete(&msrq->free);
+		}
+	} else {
+		wq	  = &qp->rq;
+		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+		++wq->tail;
+	}
+	wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+
+	switch (cqe->op_own >> 4) {
+	case MLX5_CQE_RESP_WR_IMM:
+		wc->opcode	= IB_WC_RECV_RDMA_WITH_IMM;
+		wc->wc_flags	= IB_WC_WITH_IMM;
+		wc->ex.imm_data = cqe->imm_inval_pkey;
+		break;
+	case MLX5_CQE_RESP_SEND:
+		wc->opcode   = IB_WC_RECV;
+		wc->wc_flags = 0;
+		break;
+	case MLX5_CQE_RESP_SEND_IMM:
+		wc->opcode	= IB_WC_RECV;
+		wc->wc_flags	= IB_WC_WITH_IMM;
+		wc->ex.imm_data = cqe->imm_inval_pkey;
+		break;
+	case MLX5_CQE_RESP_SEND_INV:
+		wc->opcode	= IB_WC_RECV;
+		wc->wc_flags	= IB_WC_WITH_INVALIDATE;
+		wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey);
+		break;
+	}
+	wc->slid	   = be16_to_cpu(cqe->slid);
+	wc->sl		   = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf;
+	wc->src_qp	   = be32_to_cpu(cqe->flags_rqpn) & 0xffffff;
+	wc->dlid_path_bits = cqe->ml_path;
+	g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3;
+	wc->wc_flags |= g ? IB_WC_GRH : 0;
+	wc->pkey_index     = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff;
+}
+
+static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
+{
+	__be32 *p = (__be32 *)cqe;
+	int i;
+
+	mlx5_ib_warn(dev, "dump error cqe\n");
+	for (i = 0; i < sizeof(*cqe) / 16; i++, p += 4)
+		pr_info("%08x %08x %08x %08x\n", be32_to_cpu(p[0]),
+			be32_to_cpu(p[1]), be32_to_cpu(p[2]),
+			be32_to_cpu(p[3]));
+}
+
+static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev,
+				  struct mlx5_err_cqe *cqe,
+				  struct ib_wc *wc)
+{
+	int dump = 1;
+
+	switch (cqe->syndrome) {
+	case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+		wc->status = IB_WC_LOC_LEN_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+		wc->status = IB_WC_LOC_QP_OP_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR:
+		wc->status = IB_WC_LOC_PROT_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_WR_FLUSH_ERR:
+		dump = 0;
+		wc->status = IB_WC_WR_FLUSH_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_MW_BIND_ERR:
+		wc->status = IB_WC_MW_BIND_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_BAD_RESP_ERR:
+		wc->status = IB_WC_BAD_RESP_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+		wc->status = IB_WC_LOC_ACCESS_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+		wc->status = IB_WC_REM_INV_REQ_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+		wc->status = IB_WC_REM_ACCESS_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_REMOTE_OP_ERR:
+		wc->status = IB_WC_REM_OP_ERR;
+		break;
+	case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+		wc->status = IB_WC_RETRY_EXC_ERR;
+		dump = 0;
+		break;
+	case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+		wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+		dump = 0;
+		break;
+	case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+		wc->status = IB_WC_REM_ABORT_ERR;
+		break;
+	default:
+		wc->status = IB_WC_GENERAL_ERR;
+		break;
+	}
+
+	wc->vendor_err = cqe->vendor_err_synd;
+	if (dump)
+		dump_cqe(dev, cqe);
+}
+
+static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx)
+{
+	/* TBD: waiting decision
+	*/
+	return 0;
+}
+
+static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx)
+{
+	struct mlx5_wqe_data_seg *dpseg;
+	void *addr;
+
+	dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) +
+		sizeof(struct mlx5_wqe_raddr_seg) +
+		sizeof(struct mlx5_wqe_atomic_seg);
+	addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr);
+	return addr;
+}
+
+static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
+			  uint16_t idx)
+{
+	void *addr;
+	int byte_count;
+	int i;
+
+	if (!is_atomic_response(qp, idx))
+		return;
+
+	byte_count = be32_to_cpu(cqe64->byte_cnt);
+	addr = mlx5_get_atomic_laddr(qp, idx);
+
+	if (byte_count == 4) {
+		*(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr));
+	} else {
+		for (i = 0; i < byte_count; i += 8) {
+			*(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr));
+			addr += 8;
+		}
+	}
+
+	return;
+}
+
+static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
+			   u16 tail, u16 head)
+{
+	int idx;
+
+	do {
+		idx = tail & (qp->sq.wqe_cnt - 1);
+		handle_atomic(qp, cqe64, idx);
+		if (idx == head)
+			break;
+
+		tail = qp->sq.w_list[idx].next;
+	} while (1);
+	tail = qp->sq.w_list[idx].next;
+	qp->sq.last_poll = tail;
+}
+
+static int mlx5_poll_one(struct mlx5_ib_cq *cq,
+			 struct mlx5_ib_qp **cur_qp,
+			 struct ib_wc *wc)
+{
+	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+	struct mlx5_err_cqe *err_cqe;
+	struct mlx5_cqe64 *cqe64;
+	struct mlx5_core_qp *mqp;
+	struct mlx5_ib_wq *wq;
+	uint8_t opcode;
+	uint32_t qpn;
+	u16 wqe_ctr;
+	void *cqe;
+	int idx;
+
+	cqe = next_cqe_sw(cq);
+	if (!cqe)
+		return -EAGAIN;
+
+	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+
+	++cq->mcq.cons_index;
+
+	/* Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rmb();
+
+	/* TBD: resize CQ */
+
+	qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff;
+	if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) {
+		/* We do not have to take the QP table lock here,
+		 * because CQs will be locked while QPs are removed
+		 * from the table.
+		 */
+		mqp = __mlx5_qp_lookup(&dev->mdev, qpn);
+		if (unlikely(!mqp)) {
+			mlx5_ib_warn(dev, "CQE@CQ %06x for unknown QPN %6x\n",
+				     cq->mcq.cqn, qpn);
+			return -EINVAL;
+		}
+
+		*cur_qp = to_mibqp(mqp);
+	}
+
+	wc->qp  = &(*cur_qp)->ibqp;
+	opcode = cqe64->op_own >> 4;
+	switch (opcode) {
+	case MLX5_CQE_REQ:
+		wq = &(*cur_qp)->sq;
+		wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+		idx = wqe_ctr & (wq->wqe_cnt - 1);
+		handle_good_req(wc, cqe64, wq, idx);
+		handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
+		wc->wr_id = wq->wrid[idx];
+		wq->tail = wq->wqe_head[idx] + 1;
+		wc->status = IB_WC_SUCCESS;
+		break;
+	case MLX5_CQE_RESP_WR_IMM:
+	case MLX5_CQE_RESP_SEND:
+	case MLX5_CQE_RESP_SEND_IMM:
+	case MLX5_CQE_RESP_SEND_INV:
+		handle_responder(wc, cqe64, *cur_qp);
+		wc->status = IB_WC_SUCCESS;
+		break;
+	case MLX5_CQE_RESIZE_CQ:
+		break;
+	case MLX5_CQE_REQ_ERR:
+	case MLX5_CQE_RESP_ERR:
+		err_cqe = (struct mlx5_err_cqe *)cqe64;
+		mlx5_handle_error_cqe(dev, err_cqe, wc);
+		mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n",
+			    opcode == MLX5_CQE_REQ_ERR ?
+			    "Requestor" : "Responder", cq->mcq.cqn);
+		mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
+			    err_cqe->syndrome, err_cqe->vendor_err_synd);
+		if (opcode == MLX5_CQE_REQ_ERR) {
+			wq = &(*cur_qp)->sq;
+			wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+			idx = wqe_ctr & (wq->wqe_cnt - 1);
+			wc->wr_id = wq->wrid[idx];
+			wq->tail = wq->wqe_head[idx] + 1;
+		} else {
+			struct mlx5_ib_srq *srq;
+
+			if ((*cur_qp)->ibqp.srq) {
+				srq = to_msrq((*cur_qp)->ibqp.srq);
+				wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+				wc->wr_id = srq->wrid[wqe_ctr];
+				mlx5_ib_free_srq_wqe(srq, wqe_ctr);
+			} else {
+				wq = &(*cur_qp)->rq;
+				wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+				++wq->tail;
+			}
+		}
+		break;
+	}
+
+	return 0;
+}
+
+int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+	struct mlx5_ib_cq *cq = to_mcq(ibcq);
+	struct mlx5_ib_qp *cur_qp = NULL;
+	unsigned long flags;
+	int npolled;
+	int err = 0;
+
+	spin_lock_irqsave(&cq->lock, flags);
+
+	for (npolled = 0; npolled < num_entries; npolled++) {
+		err = mlx5_poll_one(cq, &cur_qp, wc + npolled);
+		if (err)
+			break;
+	}
+
+	if (npolled)
+		mlx5_cq_set_ci(&cq->mcq);
+
+	spin_unlock_irqrestore(&cq->lock, flags);
+
+	if (err == 0 || err == -EAGAIN)
+		return npolled;
+	else
+		return err;
+}
+
+int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+	mlx5_cq_arm(&to_mcq(ibcq)->mcq,
+		    (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+		    MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT,
+		    to_mdev(ibcq->device)->mdev.priv.uuari.uars[0].map,
+		    MLX5_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->mdev.priv.cq_uar_lock));
+
+	return 0;
+}
+
+static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf,
+			int nent, int cqe_size)
+{
+	int err;
+
+	err = mlx5_buf_alloc(&dev->mdev, nent * cqe_size,
+			     PAGE_SIZE * 2, &buf->buf);
+	if (err)
+		return err;
+
+	buf->cqe_size = cqe_size;
+
+	return 0;
+}
+
+static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf)
+{
+	mlx5_buf_free(&dev->mdev, &buf->buf);
+}
+
+static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
+			  struct ib_ucontext *context, struct mlx5_ib_cq *cq,
+			  int entries, struct mlx5_create_cq_mbox_in **cqb,
+			  int *cqe_size, int *index, int *inlen)
+{
+	struct mlx5_ib_create_cq ucmd;
+	int page_shift;
+	int npages;
+	int ncont;
+	int err;
+
+	if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+		return -EFAULT;
+
+	if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128)
+		return -EINVAL;
+
+	*cqe_size = ucmd.cqe_size;
+
+	cq->buf.umem = ib_umem_get(context, ucmd.buf_addr,
+				   entries * ucmd.cqe_size,
+				   IB_ACCESS_LOCAL_WRITE, 1);
+	if (IS_ERR(cq->buf.umem)) {
+		err = PTR_ERR(cq->buf.umem);
+		return err;
+	}
+
+	err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+				  &cq->db);
+	if (err)
+		goto err_umem;
+
+	mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, &npages, &page_shift,
+			   &ncont, NULL);
+	mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n",
+		    ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont);
+
+	*inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * ncont;
+	*cqb = mlx5_vzalloc(*inlen);
+	if (!*cqb) {
+		err = -ENOMEM;
+		goto err_db;
+	}
+	mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0);
+	(*cqb)->ctx.log_pg_sz = page_shift - PAGE_SHIFT;
+
+	*index = to_mucontext(context)->uuari.uars[0].index;
+
+	return 0;
+
+err_db:
+	mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
+
+err_umem:
+	ib_umem_release(cq->buf.umem);
+	return err;
+}
+
+static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context)
+{
+	mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
+	ib_umem_release(cq->buf.umem);
+}
+
+static void init_cq_buf(struct mlx5_ib_cq *cq, int nent)
+{
+	int i;
+	void *cqe;
+	struct mlx5_cqe64 *cqe64;
+
+	for (i = 0; i < nent; i++) {
+		cqe = get_cqe(cq, i);
+		cqe64 = (cq->buf.cqe_size == 64) ? cqe : cqe + 64;
+		cqe64->op_own = 0xf1;
+	}
+}
+
+static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
+			    int entries, int cqe_size,
+			    struct mlx5_create_cq_mbox_in **cqb,
+			    int *index, int *inlen)
+{
+	int err;
+
+	err = mlx5_db_alloc(&dev->mdev, &cq->db);
+	if (err)
+		return err;
+
+	cq->mcq.set_ci_db  = cq->db.db;
+	cq->mcq.arm_db     = cq->db.db + 1;
+	*cq->mcq.set_ci_db = 0;
+	*cq->mcq.arm_db    = 0;
+	cq->mcq.cqe_sz = cqe_size;
+
+	err = alloc_cq_buf(dev, &cq->buf, entries, cqe_size);
+	if (err)
+		goto err_db;
+
+	init_cq_buf(cq, entries);
+
+	*inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages;
+	*cqb = mlx5_vzalloc(*inlen);
+	if (!*cqb) {
+		err = -ENOMEM;
+		goto err_buf;
+	}
+	mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas);
+
+	(*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - PAGE_SHIFT;
+	*index = dev->mdev.priv.uuari.uars[0].index;
+
+	return 0;
+
+err_buf:
+	free_cq_buf(dev, &cq->buf);
+
+err_db:
+	mlx5_db_free(&dev->mdev, &cq->db);
+	return err;
+}
+
+static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq)
+{
+	free_cq_buf(dev, &cq->buf);
+	mlx5_db_free(&dev->mdev, &cq->db);
+}
+
+struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
+				int vector, struct ib_ucontext *context,
+				struct ib_udata *udata)
+{
+	struct mlx5_create_cq_mbox_in *cqb = NULL;
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_ib_cq *cq;
+	int uninitialized_var(index);
+	int uninitialized_var(inlen);
+	int cqe_size;
+	int irqn;
+	int eqn;
+	int err;
+
+	entries = roundup_pow_of_two(entries + 1);
+	if (entries < 1 || entries > dev->mdev.caps.max_cqes)
+		return ERR_PTR(-EINVAL);
+
+	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
+
+	cq->ibcq.cqe = entries - 1;
+	mutex_init(&cq->resize_mutex);
+	spin_lock_init(&cq->lock);
+	cq->resize_buf = NULL;
+	cq->resize_umem = NULL;
+
+	if (context) {
+		err = create_cq_user(dev, udata, context, cq, entries,
+				     &cqb, &cqe_size, &index, &inlen);
+		if (err)
+			goto err_create;
+	} else {
+		/* for now choose 64 bytes till we have a proper interface */
+		cqe_size = 64;
+		err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb,
+				       &index, &inlen);
+		if (err)
+			goto err_create;
+	}
+
+	cq->cqe_size = cqe_size;
+	cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5;
+	cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index);
+	err = mlx5_vector2eqn(dev, vector, &eqn, &irqn);
+	if (err)
+		goto err_cqb;
+
+	cqb->ctx.c_eqn = cpu_to_be16(eqn);
+	cqb->ctx.db_record_addr = cpu_to_be64(cq->db.dma);
+
+	err = mlx5_core_create_cq(&dev->mdev, &cq->mcq, cqb, inlen);
+	if (err)
+		goto err_cqb;
+
+	mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
+	cq->mcq.irqn = irqn;
+	cq->mcq.comp  = mlx5_ib_cq_comp;
+	cq->mcq.event = mlx5_ib_cq_event;
+
+	if (context)
+		if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) {
+			err = -EFAULT;
+			goto err_cmd;
+		}
+
+
+	mlx5_vfree(cqb);
+	return &cq->ibcq;
+
+err_cmd:
+	mlx5_core_destroy_cq(&dev->mdev, &cq->mcq);
+
+err_cqb:
+	mlx5_vfree(cqb);
+	if (context)
+		destroy_cq_user(cq, context);
+	else
+		destroy_cq_kernel(dev, cq);
+
+err_create:
+	kfree(cq);
+
+	return ERR_PTR(err);
+}
+
+
+int mlx5_ib_destroy_cq(struct ib_cq *cq)
+{
+	struct mlx5_ib_dev *dev = to_mdev(cq->device);
+	struct mlx5_ib_cq *mcq = to_mcq(cq);
+	struct ib_ucontext *context = NULL;
+
+	if (cq->uobject)
+		context = cq->uobject->context;
+
+	mlx5_core_destroy_cq(&dev->mdev, &mcq->mcq);
+	if (context)
+		destroy_cq_user(mcq, context);
+	else
+		destroy_cq_kernel(dev, mcq);
+
+	kfree(mcq);
+
+	return 0;
+}
+
+static int is_equal_rsn(struct mlx5_cqe64 *cqe64, struct mlx5_ib_srq *srq,
+			u32 rsn)
+{
+	u32 lrsn;
+
+	if (srq)
+		lrsn = be32_to_cpu(cqe64->srqn) & 0xffffff;
+	else
+		lrsn = be32_to_cpu(cqe64->sop_drop_qpn) & 0xffffff;
+
+	return rsn == lrsn;
+}
+
+void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq)
+{
+	struct mlx5_cqe64 *cqe64, *dest64;
+	void *cqe, *dest;
+	u32 prod_index;
+	int nfreed = 0;
+	u8 owner_bit;
+
+	if (!cq)
+		return;
+
+	/* First we need to find the current producer index, so we
+	 * know where to start cleaning from.  It doesn't matter if HW
+	 * adds new entries after this loop -- the QP we're worried
+	 * about is already in RESET, so the new entries won't come
+	 * from our QP and therefore don't need to be checked.
+	 */
+	for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++)
+		if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+			break;
+
+	/* Now sweep backwards through the CQ, removing CQ entries
+	 * that match our QP by copying older entries on top of them.
+	 */
+	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
+		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+		cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+		if (is_equal_rsn(cqe64, srq, rsn)) {
+			if (srq)
+				mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter));
+			++nfreed;
+		} else if (nfreed) {
+			dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+			dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64;
+			owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK;
+			memcpy(dest, cqe, cq->mcq.cqe_sz);
+			dest64->op_own = owner_bit |
+				(dest64->op_own & ~MLX5_CQE_OWNER_MASK);
+		}
+	}
+
+	if (nfreed) {
+		cq->mcq.cons_index += nfreed;
+		/* Make sure update of buffer contents is done before
+		 * updating consumer index.
+		 */
+		wmb();
+		mlx5_cq_set_ci(&cq->mcq);
+	}
+}
+
+void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq)
+{
+	if (!cq)
+		return;
+
+	spin_lock_irq(&cq->lock);
+	__mlx5_ib_cq_clean(cq, qpn, srq);
+	spin_unlock_irq(&cq->lock);
+}
+
+int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+	return -ENOSYS;
+}
+
+int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+	return -ENOSYS;
+}
+
+int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq)
+{
+	struct mlx5_ib_cq *cq;
+
+	if (!ibcq)
+		return 128;
+
+	cq = to_mcq(ibcq);
+	return cq->cqe_size;
+}
diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c
new file mode 100644
index 000000000000..256a23344f28
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/doorbell.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kref.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+
+#include "mlx5_ib.h"
+
+struct mlx5_ib_user_db_page {
+	struct list_head	list;
+	struct ib_umem	       *umem;
+	unsigned long		user_virt;
+	int			refcnt;
+};
+
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+			struct mlx5_db *db)
+{
+	struct mlx5_ib_user_db_page *page;
+	struct ib_umem_chunk *chunk;
+	int err = 0;
+
+	mutex_lock(&context->db_page_mutex);
+
+	list_for_each_entry(page, &context->db_page_list, list)
+		if (page->user_virt == (virt & PAGE_MASK))
+			goto found;
+
+	page = kmalloc(sizeof(*page), GFP_KERNEL);
+	if (!page) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	page->user_virt = (virt & PAGE_MASK);
+	page->refcnt    = 0;
+	page->umem      = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
+				      PAGE_SIZE, 0, 0);
+	if (IS_ERR(page->umem)) {
+		err = PTR_ERR(page->umem);
+		kfree(page);
+		goto out;
+	}
+
+	list_add(&page->list, &context->db_page_list);
+
+found:
+	chunk = list_entry(page->umem->chunk_list.next, struct ib_umem_chunk, list);
+	db->dma		= sg_dma_address(chunk->page_list) + (virt & ~PAGE_MASK);
+	db->u.user_page = page;
+	++page->refcnt;
+
+out:
+	mutex_unlock(&context->db_page_mutex);
+
+	return err;
+}
+
+void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db)
+{
+	mutex_lock(&context->db_page_mutex);
+
+	if (!--db->u.user_page->refcnt) {
+		list_del(&db->u.user_page->list);
+		ib_umem_release(db->u.user_page->umem);
+		kfree(db->u.user_page);
+	}
+
+	mutex_unlock(&context->db_page_mutex);
+}
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
new file mode 100644
index 000000000000..5c8938be0e08
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/cmd.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include "mlx5_ib.h"
+
+enum {
+	MLX5_IB_VENDOR_CLASS1 = 0x9,
+	MLX5_IB_VENDOR_CLASS2 = 0xa
+};
+
+int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		 void *in_mad, void *response_mad)
+{
+	u8 op_modifier = 0;
+
+	/* Key check traps can't be generated unless we have in_wc to
+	 * tell us where to send the trap.
+	 */
+	if (ignore_mkey || !in_wc)
+		op_modifier |= 0x1;
+	if (ignore_bkey || !in_wc)
+		op_modifier |= 0x2;
+
+	return mlx5_core_mad_ifc(&dev->mdev, in_mad, response_mad, op_modifier, port);
+}
+
+int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+	u16 slid;
+	int err;
+
+	slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0)
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+	    in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+		if (in_mad->mad_hdr.method   != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_SET &&
+		    in_mad->mad_hdr.method   != IB_MGMT_METHOD_TRAP_REPRESS)
+			return IB_MAD_RESULT_SUCCESS;
+
+		/* Don't process SMInfo queries -- the SMA can't handle them.
+		 */
+		if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO)
+			return IB_MAD_RESULT_SUCCESS;
+	} else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+		   in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1   ||
+		   in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2   ||
+		   in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
+		if (in_mad->mad_hdr.method  != IB_MGMT_METHOD_GET &&
+		    in_mad->mad_hdr.method  != IB_MGMT_METHOD_SET)
+			return IB_MAD_RESULT_SUCCESS;
+	} else {
+		return IB_MAD_RESULT_SUCCESS;
+	}
+
+	err = mlx5_MAD_IFC(to_mdev(ibdev),
+			   mad_flags & IB_MAD_IGNORE_MKEY,
+			   mad_flags & IB_MAD_IGNORE_BKEY,
+			   port_num, in_wc, in_grh, in_mad, out_mad);
+	if (err)
+		return IB_MAD_RESULT_FAILURE;
+
+	/* set return bit in status of directed route responses */
+	if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+		out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+		/* no response for trap repress */
+		return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+	return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	u16 packet_error;
+
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+
+	packet_error = be16_to_cpu(out_mad->status);
+
+	dev->mdev.caps.ext_port_cap[port - 1] = (!err && !packet_error) ?
+		MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0;
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
new file mode 100644
index 000000000000..6b1007f9bc29
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -0,0 +1,1504 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm-generic/kmap_types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/io-mapping.h>
+#include <linux/sched.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include "user.h"
+#include "mlx5_ib.h"
+
+#define DRIVER_NAME "mlx5_ib"
+#define DRIVER_VERSION "1.0"
+#define DRIVER_RELDATE	"June 2013"
+
+MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRIVER_VERSION);
+
+static int prof_sel = 2;
+module_param_named(prof_sel, prof_sel, int, 0444);
+MODULE_PARM_DESC(prof_sel, "profile selector. Valid range 0 - 2");
+
+static char mlx5_version[] =
+	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
+	DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
+
+struct mlx5_profile profile[] = {
+	[0] = {
+		.mask		= 0,
+	},
+	[1] = {
+		.mask		= MLX5_PROF_MASK_QP_SIZE,
+		.log_max_qp	= 12,
+	},
+	[2] = {
+		.mask		= MLX5_PROF_MASK_QP_SIZE |
+				  MLX5_PROF_MASK_MR_CACHE,
+		.log_max_qp	= 17,
+		.mr_cache[0]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[1]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[2]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[3]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[4]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[5]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[6]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[7]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[8]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[9]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[10]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[11]	= {
+			.size	= 500,
+			.limit	= 250
+		},
+		.mr_cache[12]	= {
+			.size	= 64,
+			.limit	= 32
+		},
+		.mr_cache[13]	= {
+			.size	= 32,
+			.limit	= 16
+		},
+		.mr_cache[14]	= {
+			.size	= 16,
+			.limit	= 8
+		},
+		.mr_cache[15]	= {
+			.size	= 8,
+			.limit	= 4
+		},
+	},
+};
+
+int mlx5_vector2eqn(struct mlx5_ib_dev *dev, int vector, int *eqn, int *irqn)
+{
+	struct mlx5_eq_table *table = &dev->mdev.priv.eq_table;
+	struct mlx5_eq *eq, *n;
+	int err = -ENOENT;
+
+	spin_lock(&table->lock);
+	list_for_each_entry_safe(eq, n, &dev->eqs_list, list) {
+		if (eq->index == vector) {
+			*eqn = eq->eqn;
+			*irqn = eq->irqn;
+			err = 0;
+			break;
+		}
+	}
+	spin_unlock(&table->lock);
+
+	return err;
+}
+
+static int alloc_comp_eqs(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->mdev.priv.eq_table;
+	struct mlx5_eq *eq, *n;
+	int ncomp_vec;
+	int nent;
+	int err;
+	int i;
+
+	INIT_LIST_HEAD(&dev->eqs_list);
+	ncomp_vec = table->num_comp_vectors;
+	nent = MLX5_COMP_EQ_SIZE;
+	for (i = 0; i < ncomp_vec; i++) {
+		eq = kzalloc(sizeof(*eq), GFP_KERNEL);
+		if (!eq) {
+			err = -ENOMEM;
+			goto clean;
+		}
+
+		snprintf(eq->name, MLX5_MAX_EQ_NAME, "mlx5_comp%d", i);
+		err = mlx5_create_map_eq(&dev->mdev, eq,
+					 i + MLX5_EQ_VEC_COMP_BASE, nent, 0,
+					 eq->name,
+					 &dev->mdev.priv.uuari.uars[0]);
+		if (err) {
+			kfree(eq);
+			goto clean;
+		}
+		mlx5_ib_dbg(dev, "allocated completion EQN %d\n", eq->eqn);
+		eq->index = i;
+		spin_lock(&table->lock);
+		list_add_tail(&eq->list, &dev->eqs_list);
+		spin_unlock(&table->lock);
+	}
+
+	dev->num_comp_vectors = ncomp_vec;
+	return 0;
+
+clean:
+	spin_lock(&table->lock);
+	list_for_each_entry_safe(eq, n, &dev->eqs_list, list) {
+		list_del(&eq->list);
+		spin_unlock(&table->lock);
+		if (mlx5_destroy_unmap_eq(&dev->mdev, eq))
+			mlx5_ib_warn(dev, "failed to destroy EQ 0x%x\n", eq->eqn);
+		kfree(eq);
+		spin_lock(&table->lock);
+	}
+	spin_unlock(&table->lock);
+	return err;
+}
+
+static void free_comp_eqs(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->mdev.priv.eq_table;
+	struct mlx5_eq *eq, *n;
+
+	spin_lock(&table->lock);
+	list_for_each_entry_safe(eq, n, &dev->eqs_list, list) {
+		list_del(&eq->list);
+		spin_unlock(&table->lock);
+		if (mlx5_destroy_unmap_eq(&dev->mdev, eq))
+			mlx5_ib_warn(dev, "failed to destroy EQ 0x%x\n", eq->eqn);
+		kfree(eq);
+		spin_lock(&table->lock);
+	}
+	spin_unlock(&table->lock);
+}
+
+static int mlx5_ib_query_device(struct ib_device *ibdev,
+				struct ib_device_attr *props)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+	int max_rq_sg;
+	int max_sq_sg;
+	u64 flags;
+
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memset(props, 0, sizeof(*props));
+
+	props->fw_ver = ((u64)fw_rev_maj(&dev->mdev) << 32) |
+		(fw_rev_min(&dev->mdev) << 16) |
+		fw_rev_sub(&dev->mdev);
+	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
+		IB_DEVICE_PORT_ACTIVE_EVENT		|
+		IB_DEVICE_SYS_IMAGE_GUID		|
+		IB_DEVICE_RC_RNR_NAK_GEN		|
+		IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+	flags = dev->mdev.caps.flags;
+	if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+	if (flags & MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+	if (flags & MLX5_DEV_CAP_FLAG_APM)
+		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+	props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
+	if (flags & MLX5_DEV_CAP_FLAG_XRC)
+		props->device_cap_flags |= IB_DEVICE_XRC;
+	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+
+	props->vendor_id	   = be32_to_cpup((__be32 *)(out_mad->data + 36)) &
+		0xffffff;
+	props->vendor_part_id	   = be16_to_cpup((__be16 *)(out_mad->data + 30));
+	props->hw_ver		   = be32_to_cpup((__be32 *)(out_mad->data + 32));
+	memcpy(&props->sys_image_guid, out_mad->data +	4, 8);
+
+	props->max_mr_size	   = ~0ull;
+	props->page_size_cap	   = dev->mdev.caps.min_page_sz;
+	props->max_qp		   = 1 << dev->mdev.caps.log_max_qp;
+	props->max_qp_wr	   = dev->mdev.caps.max_wqes;
+	max_rq_sg = dev->mdev.caps.max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg);
+	max_sq_sg = (dev->mdev.caps.max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) /
+		sizeof(struct mlx5_wqe_data_seg);
+	props->max_sge = min(max_rq_sg, max_sq_sg);
+	props->max_cq		   = 1 << dev->mdev.caps.log_max_cq;
+	props->max_cqe		   = dev->mdev.caps.max_cqes - 1;
+	props->max_mr		   = 1 << dev->mdev.caps.log_max_mkey;
+	props->max_pd		   = 1 << dev->mdev.caps.log_max_pd;
+	props->max_qp_rd_atom	   = dev->mdev.caps.max_ra_req_qp;
+	props->max_qp_init_rd_atom = dev->mdev.caps.max_ra_res_qp;
+	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
+	props->max_srq		   = 1 << dev->mdev.caps.log_max_srq;
+	props->max_srq_wr	   = dev->mdev.caps.max_srq_wqes - 1;
+	props->max_srq_sge	   = max_rq_sg - 1;
+	props->max_fast_reg_page_list_len = (unsigned int)-1;
+	props->local_ca_ack_delay  = dev->mdev.caps.local_ca_ack_delay;
+	props->atomic_cap	   = dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_ATOMIC ?
+		IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+	props->masked_atomic_cap   = IB_ATOMIC_HCA;
+	props->max_pkeys	   = be16_to_cpup((__be16 *)(out_mad->data + 28));
+	props->max_mcast_grp	   = 1 << dev->mdev.caps.log_max_mcg;
+	props->max_mcast_qp_attach = dev->mdev.caps.max_qp_mcg;
+	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+					   props->max_mcast_grp;
+	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+
+	return err;
+}
+
+int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
+		       struct ib_port_attr *props)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int ext_active_speed;
+	int err = -ENOMEM;
+
+	if (port < 1 || port > dev->mdev.caps.num_ports) {
+		mlx5_ib_warn(dev, "invalid port number %d\n", port);
+		return -EINVAL;
+	}
+
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	memset(props, 0, sizeof(*props));
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err) {
+		mlx5_ib_warn(dev, "err %d\n", err);
+		goto out;
+	}
+
+
+	props->lid		= be16_to_cpup((__be16 *)(out_mad->data + 16));
+	props->lmc		= out_mad->data[34] & 0x7;
+	props->sm_lid		= be16_to_cpup((__be16 *)(out_mad->data + 18));
+	props->sm_sl		= out_mad->data[36] & 0xf;
+	props->state		= out_mad->data[32] & 0xf;
+	props->phys_state	= out_mad->data[33] >> 4;
+	props->port_cap_flags	= be32_to_cpup((__be32 *)(out_mad->data + 20));
+	props->gid_tbl_len	= out_mad->data[50];
+	props->max_msg_sz	= 1 << to_mdev(ibdev)->mdev.caps.log_max_msg;
+	props->pkey_tbl_len	= to_mdev(ibdev)->mdev.caps.port[port - 1].pkey_table_len;
+	props->bad_pkey_cntr	= be16_to_cpup((__be16 *)(out_mad->data + 46));
+	props->qkey_viol_cntr	= be16_to_cpup((__be16 *)(out_mad->data + 48));
+	props->active_width	= out_mad->data[31] & 0xf;
+	props->active_speed	= out_mad->data[35] >> 4;
+	props->max_mtu		= out_mad->data[41] & 0xf;
+	props->active_mtu	= out_mad->data[36] >> 4;
+	props->subnet_timeout	= out_mad->data[51] & 0x1f;
+	props->max_vl_num	= out_mad->data[37] >> 4;
+	props->init_type_reply	= out_mad->data[41] >> 4;
+
+	/* Check if extended speeds (EDR/FDR/...) are supported */
+	if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) {
+		ext_active_speed = out_mad->data[62] >> 4;
+
+		switch (ext_active_speed) {
+		case 1:
+			props->active_speed = 16; /* FDR */
+			break;
+		case 2:
+			props->active_speed = 32; /* EDR */
+			break;
+		}
+	}
+
+	/* If reported active speed is QDR, check if is FDR-10 */
+	if (props->active_speed == 4) {
+		if (dev->mdev.caps.ext_port_cap[port - 1] &
+		    MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) {
+			init_query_mad(in_mad);
+			in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO;
+			in_mad->attr_mod = cpu_to_be32(port);
+
+			err = mlx5_MAD_IFC(dev, 1, 1, port,
+					   NULL, NULL, in_mad, out_mad);
+			if (err)
+				goto out;
+
+			/* Checking LinkSpeedActive for FDR-10 */
+			if (out_mad->data[15] & 0x1)
+				props->active_speed = 8;
+		}
+	}
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+
+	return err;
+}
+
+static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+			     union ib_gid *gid)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PORT_INFO;
+	in_mad->attr_mod = cpu_to_be32(port);
+
+	err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw, out_mad->data + 8, 8);
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_GUID_INFO;
+	in_mad->attr_mod = cpu_to_be32(index / 8);
+
+	err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+			      u16 *pkey)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id  = IB_SMP_ATTR_PKEY_TABLE;
+	in_mad->attr_mod = cpu_to_be32(index / 32);
+
+	err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	*pkey = be16_to_cpu(((__be16 *)out_mad->data)[index % 32]);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+struct mlx5_reg_node_desc {
+	u8	desc[64];
+};
+
+static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
+				 struct ib_device_modify *props)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_reg_node_desc in;
+	struct mlx5_reg_node_desc out;
+	int err;
+
+	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+		return -EOPNOTSUPP;
+
+	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
+		return 0;
+
+	/*
+	 * If possible, pass node desc to FW, so it can generate
+	 * a 144 trap.  If cmd fails, just ignore.
+	 */
+	memcpy(&in, props->node_desc, 64);
+	err = mlx5_core_access_reg(&dev->mdev, &in, sizeof(in), &out,
+				   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
+	if (err)
+		return err;
+
+	memcpy(ibdev->node_desc, props->node_desc, 64);
+
+	return err;
+}
+
+static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+			       struct ib_port_modify *props)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct ib_port_attr attr;
+	u32 tmp;
+	int err;
+
+	mutex_lock(&dev->cap_mask_mutex);
+
+	err = mlx5_ib_query_port(ibdev, port, &attr);
+	if (err)
+		goto out;
+
+	tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
+		~props->clr_port_cap_mask;
+
+	err = mlx5_set_port_caps(&dev->mdev, port, tmp);
+
+out:
+	mutex_unlock(&dev->cap_mask_mutex);
+	return err;
+}
+
+static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
+						  struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_ib_alloc_ucontext_req req;
+	struct mlx5_ib_alloc_ucontext_resp resp;
+	struct mlx5_ib_ucontext *context;
+	struct mlx5_uuar_info *uuari;
+	struct mlx5_uar *uars;
+	int num_uars;
+	int uuarn;
+	int err;
+	int i;
+
+	if (!dev->ib_active)
+		return ERR_PTR(-EAGAIN);
+
+	err = ib_copy_from_udata(&req, udata, sizeof(req));
+	if (err)
+		return ERR_PTR(err);
+
+	if (req.total_num_uuars > MLX5_MAX_UUARS)
+		return ERR_PTR(-ENOMEM);
+
+	if (req.total_num_uuars == 0)
+		return ERR_PTR(-EINVAL);
+
+	req.total_num_uuars = ALIGN(req.total_num_uuars, MLX5_BF_REGS_PER_PAGE);
+	if (req.num_low_latency_uuars > req.total_num_uuars - 1)
+		return ERR_PTR(-EINVAL);
+
+	num_uars = req.total_num_uuars / MLX5_BF_REGS_PER_PAGE;
+	resp.qp_tab_size      = 1 << dev->mdev.caps.log_max_qp;
+	resp.bf_reg_size      = dev->mdev.caps.bf_reg_size;
+	resp.cache_line_size  = L1_CACHE_BYTES;
+	resp.max_sq_desc_sz = dev->mdev.caps.max_sq_desc_sz;
+	resp.max_rq_desc_sz = dev->mdev.caps.max_rq_desc_sz;
+	resp.max_send_wqebb = dev->mdev.caps.max_wqes;
+	resp.max_recv_wr = dev->mdev.caps.max_wqes;
+	resp.max_srq_recv_wr = dev->mdev.caps.max_srq_wqes;
+
+	context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return ERR_PTR(-ENOMEM);
+
+	uuari = &context->uuari;
+	mutex_init(&uuari->lock);
+	uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
+	if (!uars) {
+		err = -ENOMEM;
+		goto out_ctx;
+	}
+
+	uuari->bitmap = kcalloc(BITS_TO_LONGS(req.total_num_uuars),
+				sizeof(*uuari->bitmap),
+				GFP_KERNEL);
+	if (!uuari->bitmap) {
+		err = -ENOMEM;
+		goto out_uar_ctx;
+	}
+	/*
+	 * clear all fast path uuars
+	 */
+	for (i = 0; i < req.total_num_uuars; i++) {
+		uuarn = i & 3;
+		if (uuarn == 2 || uuarn == 3)
+			set_bit(i, uuari->bitmap);
+	}
+
+	uuari->count = kcalloc(req.total_num_uuars, sizeof(*uuari->count), GFP_KERNEL);
+	if (!uuari->count) {
+		err = -ENOMEM;
+		goto out_bitmap;
+	}
+
+	for (i = 0; i < num_uars; i++) {
+		err = mlx5_cmd_alloc_uar(&dev->mdev, &uars[i].index);
+		if (err)
+			goto out_count;
+	}
+
+	INIT_LIST_HEAD(&context->db_page_list);
+	mutex_init(&context->db_page_mutex);
+
+	resp.tot_uuars = req.total_num_uuars;
+	resp.num_ports = dev->mdev.caps.num_ports;
+	err = ib_copy_to_udata(udata, &resp, sizeof(resp));
+	if (err)
+		goto out_uars;
+
+	uuari->num_low_latency_uuars = req.num_low_latency_uuars;
+	uuari->uars = uars;
+	uuari->num_uars = num_uars;
+	return &context->ibucontext;
+
+out_uars:
+	for (i--; i >= 0; i--)
+		mlx5_cmd_free_uar(&dev->mdev, uars[i].index);
+out_count:
+	kfree(uuari->count);
+
+out_bitmap:
+	kfree(uuari->bitmap);
+
+out_uar_ctx:
+	kfree(uars);
+
+out_ctx:
+	kfree(context);
+	return ERR_PTR(err);
+}
+
+static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
+	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
+	struct mlx5_uuar_info *uuari = &context->uuari;
+	int i;
+
+	for (i = 0; i < uuari->num_uars; i++) {
+		if (mlx5_cmd_free_uar(&dev->mdev, uuari->uars[i].index))
+			mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
+	}
+
+	kfree(uuari->count);
+	kfree(uuari->bitmap);
+	kfree(uuari->uars);
+	kfree(context);
+
+	return 0;
+}
+
+static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index)
+{
+	return (pci_resource_start(dev->mdev.pdev, 0) >> PAGE_SHIFT) + index;
+}
+
+static int get_command(unsigned long offset)
+{
+	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
+}
+
+static int get_arg(unsigned long offset)
+{
+	return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
+}
+
+static int get_index(unsigned long offset)
+{
+	return get_arg(offset);
+}
+
+static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
+{
+	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
+	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
+	struct mlx5_uuar_info *uuari = &context->uuari;
+	unsigned long command;
+	unsigned long idx;
+	phys_addr_t pfn;
+
+	command = get_command(vma->vm_pgoff);
+	switch (command) {
+	case MLX5_IB_MMAP_REGULAR_PAGE:
+		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+			return -EINVAL;
+
+		idx = get_index(vma->vm_pgoff);
+		pfn = uar_index2pfn(dev, uuari->uars[idx].index);
+		mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx,
+			    (unsigned long long)pfn);
+
+		if (idx >= uuari->num_uars)
+			return -EINVAL;
+
+		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+		if (io_remap_pfn_range(vma, vma->vm_start, pfn,
+				       PAGE_SIZE, vma->vm_page_prot))
+			return -EAGAIN;
+
+		mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n",
+			    vma->vm_start,
+			    (unsigned long long)pfn << PAGE_SHIFT);
+		break;
+
+	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
+		return -ENOSYS;
+
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn)
+{
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_mkey_seg *seg;
+	struct mlx5_core_mr mr;
+	int err;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	seg = &in->seg;
+	seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA;
+	seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64);
+	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+	seg->start_addr = 0;
+
+	err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in));
+	if (err) {
+		mlx5_ib_warn(dev, "failed to create mkey, %d\n", err);
+		goto err_in;
+	}
+
+	kfree(in);
+	*key = mr.key;
+
+	return 0;
+
+err_in:
+	kfree(in);
+
+	return err;
+}
+
+static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key)
+{
+	struct mlx5_core_mr mr;
+	int err;
+
+	memset(&mr, 0, sizeof(mr));
+	mr.key = key;
+	err = mlx5_core_destroy_mkey(&dev->mdev, &mr);
+	if (err)
+		mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key);
+}
+
+static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
+				      struct ib_ucontext *context,
+				      struct ib_udata *udata)
+{
+	struct mlx5_ib_alloc_pd_resp resp;
+	struct mlx5_ib_pd *pd;
+	int err;
+
+	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+	if (!pd)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx5_core_alloc_pd(&to_mdev(ibdev)->mdev, &pd->pdn);
+	if (err) {
+		kfree(pd);
+		return ERR_PTR(err);
+	}
+
+	if (context) {
+		resp.pdn = pd->pdn;
+		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
+			mlx5_core_dealloc_pd(&to_mdev(ibdev)->mdev, pd->pdn);
+			kfree(pd);
+			return ERR_PTR(-EFAULT);
+		}
+	} else {
+		err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn);
+		if (err) {
+			mlx5_core_dealloc_pd(&to_mdev(ibdev)->mdev, pd->pdn);
+			kfree(pd);
+			return ERR_PTR(err);
+		}
+	}
+
+	return &pd->ibpd;
+}
+
+static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
+{
+	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
+	struct mlx5_ib_pd *mpd = to_mpd(pd);
+
+	if (!pd->uobject)
+		free_pa_mkey(mdev, mpd->pa_lkey);
+
+	mlx5_core_dealloc_pd(&mdev->mdev, mpd->pdn);
+	kfree(mpd);
+
+	return 0;
+}
+
+static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	int err;
+
+	err = mlx5_core_attach_mcg(&dev->mdev, gid, ibqp->qp_num);
+	if (err)
+		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
+			     ibqp->qp_num, gid->raw);
+
+	return err;
+}
+
+static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	int err;
+
+	err = mlx5_core_detach_mcg(&dev->mdev, gid, ibqp->qp_num);
+	if (err)
+		mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
+			     ibqp->qp_num, gid->raw);
+
+	return err;
+}
+
+static int init_node_data(struct mlx5_ib_dev *dev)
+{
+	struct ib_smp *in_mad  = NULL;
+	struct ib_smp *out_mad = NULL;
+	int err = -ENOMEM;
+
+	in_mad  = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+	out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+	if (!in_mad || !out_mad)
+		goto out;
+
+	init_query_mad(in_mad);
+	in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+
+	err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	memcpy(dev->ib_dev.node_desc, out_mad->data, 64);
+
+	in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+	err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+	if (err)
+		goto out;
+
+	dev->mdev.rev_id = be32_to_cpup((__be32 *)(out_mad->data + 32));
+	memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+	kfree(in_mad);
+	kfree(out_mad);
+	return err;
+}
+
+static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
+			     char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+
+	return sprintf(buf, "%d\n", dev->mdev.priv.fw_pages);
+}
+
+static ssize_t show_reg_pages(struct device *device,
+			      struct device_attribute *attr, char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+
+	return sprintf(buf, "%d\n", dev->mdev.priv.reg_pages);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+	return sprintf(buf, "MT%d\n", dev->mdev.pdev->device);
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+			   char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+	return sprintf(buf, "%d.%d.%d\n", fw_rev_maj(&dev->mdev),
+		       fw_rev_min(&dev->mdev), fw_rev_sub(&dev->mdev));
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+			char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+	return sprintf(buf, "%x\n", dev->mdev.rev_id);
+}
+
+static ssize_t show_board(struct device *device, struct device_attribute *attr,
+			  char *buf)
+{
+	struct mlx5_ib_dev *dev =
+		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+	return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
+		       dev->mdev.board_id);
+}
+
+static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
+static DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
+static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
+static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
+
+static struct device_attribute *mlx5_class_attributes[] = {
+	&dev_attr_hw_rev,
+	&dev_attr_fw_ver,
+	&dev_attr_hca_type,
+	&dev_attr_board_id,
+	&dev_attr_fw_pages,
+	&dev_attr_reg_pages,
+};
+
+static void mlx5_ib_event(struct mlx5_core_dev *dev, enum mlx5_dev_event event,
+			  void *data)
+{
+	struct mlx5_ib_dev *ibdev = container_of(dev, struct mlx5_ib_dev, mdev);
+	struct ib_event ibev;
+	u8 port = 0;
+
+	switch (event) {
+	case MLX5_DEV_EVENT_SYS_ERROR:
+		ibdev->ib_active = false;
+		ibev.event = IB_EVENT_DEVICE_FATAL;
+		break;
+
+	case MLX5_DEV_EVENT_PORT_UP:
+		ibev.event = IB_EVENT_PORT_ACTIVE;
+		port = *(u8 *)data;
+		break;
+
+	case MLX5_DEV_EVENT_PORT_DOWN:
+		ibev.event = IB_EVENT_PORT_ERR;
+		port = *(u8 *)data;
+		break;
+
+	case MLX5_DEV_EVENT_PORT_INITIALIZED:
+		/* not used by ULPs */
+		return;
+
+	case MLX5_DEV_EVENT_LID_CHANGE:
+		ibev.event = IB_EVENT_LID_CHANGE;
+		port = *(u8 *)data;
+		break;
+
+	case MLX5_DEV_EVENT_PKEY_CHANGE:
+		ibev.event = IB_EVENT_PKEY_CHANGE;
+		port = *(u8 *)data;
+		break;
+
+	case MLX5_DEV_EVENT_GUID_CHANGE:
+		ibev.event = IB_EVENT_GID_CHANGE;
+		port = *(u8 *)data;
+		break;
+
+	case MLX5_DEV_EVENT_CLIENT_REREG:
+		ibev.event = IB_EVENT_CLIENT_REREGISTER;
+		port = *(u8 *)data;
+		break;
+	}
+
+	ibev.device	      = &ibdev->ib_dev;
+	ibev.element.port_num = port;
+
+	if (ibdev->ib_active)
+		ib_dispatch_event(&ibev);
+}
+
+static void get_ext_port_caps(struct mlx5_ib_dev *dev)
+{
+	int port;
+
+	for (port = 1; port <= dev->mdev.caps.num_ports; port++)
+		mlx5_query_ext_port_caps(dev, port);
+}
+
+static int get_port_caps(struct mlx5_ib_dev *dev)
+{
+	struct ib_device_attr *dprops = NULL;
+	struct ib_port_attr *pprops = NULL;
+	int err = 0;
+	int port;
+
+	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
+	if (!pprops)
+		goto out;
+
+	dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
+	if (!dprops)
+		goto out;
+
+	err = mlx5_ib_query_device(&dev->ib_dev, dprops);
+	if (err) {
+		mlx5_ib_warn(dev, "query_device failed %d\n", err);
+		goto out;
+	}
+
+	for (port = 1; port <= dev->mdev.caps.num_ports; port++) {
+		err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
+		if (err) {
+			mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err);
+			break;
+		}
+		dev->mdev.caps.port[port - 1].pkey_table_len = dprops->max_pkeys;
+		dev->mdev.caps.port[port - 1].gid_table_len = pprops->gid_tbl_len;
+		mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
+			    dprops->max_pkeys, pprops->gid_tbl_len);
+	}
+
+out:
+	kfree(pprops);
+	kfree(dprops);
+
+	return err;
+}
+
+static void destroy_umrc_res(struct mlx5_ib_dev *dev)
+{
+	int err;
+
+	err = mlx5_mr_cache_cleanup(dev);
+	if (err)
+		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
+
+	mlx5_ib_destroy_qp(dev->umrc.qp);
+	ib_destroy_cq(dev->umrc.cq);
+	ib_dereg_mr(dev->umrc.mr);
+	ib_dealloc_pd(dev->umrc.pd);
+}
+
+enum {
+	MAX_UMR_WR = 128,
+};
+
+static int create_umr_res(struct mlx5_ib_dev *dev)
+{
+	struct ib_qp_init_attr *init_attr = NULL;
+	struct ib_qp_attr *attr = NULL;
+	struct ib_pd *pd;
+	struct ib_cq *cq;
+	struct ib_qp *qp;
+	struct ib_mr *mr;
+	int ret;
+
+	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
+	if (!attr || !init_attr) {
+		ret = -ENOMEM;
+		goto error_0;
+	}
+
+	pd = ib_alloc_pd(&dev->ib_dev);
+	if (IS_ERR(pd)) {
+		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
+		ret = PTR_ERR(pd);
+		goto error_0;
+	}
+
+	mr = ib_get_dma_mr(pd,  IB_ACCESS_LOCAL_WRITE);
+	if (IS_ERR(mr)) {
+		mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n");
+		ret = PTR_ERR(mr);
+		goto error_1;
+	}
+
+	cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, 128,
+			  0);
+	if (IS_ERR(cq)) {
+		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
+		ret = PTR_ERR(cq);
+		goto error_2;
+	}
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+	init_attr->send_cq = cq;
+	init_attr->recv_cq = cq;
+	init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+	init_attr->cap.max_send_wr = MAX_UMR_WR;
+	init_attr->cap.max_send_sge = 1;
+	init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
+	init_attr->port_num = 1;
+	qp = mlx5_ib_create_qp(pd, init_attr, NULL);
+	if (IS_ERR(qp)) {
+		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
+		ret = PTR_ERR(qp);
+		goto error_3;
+	}
+	qp->device     = &dev->ib_dev;
+	qp->real_qp    = qp;
+	qp->uobject    = NULL;
+	qp->qp_type    = MLX5_IB_QPT_REG_UMR;
+
+	attr->qp_state = IB_QPS_INIT;
+	attr->port_num = 1;
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
+				IB_QP_PORT, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
+		goto error_4;
+	}
+
+	memset(attr, 0, sizeof(*attr));
+	attr->qp_state = IB_QPS_RTR;
+	attr->path_mtu = IB_MTU_256;
+
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
+		goto error_4;
+	}
+
+	memset(attr, 0, sizeof(*attr));
+	attr->qp_state = IB_QPS_RTS;
+	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
+	if (ret) {
+		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
+		goto error_4;
+	}
+
+	dev->umrc.qp = qp;
+	dev->umrc.cq = cq;
+	dev->umrc.mr = mr;
+	dev->umrc.pd = pd;
+
+	sema_init(&dev->umrc.sem, MAX_UMR_WR);
+	ret = mlx5_mr_cache_init(dev);
+	if (ret) {
+		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
+		goto error_4;
+	}
+
+	kfree(attr);
+	kfree(init_attr);
+
+	return 0;
+
+error_4:
+	mlx5_ib_destroy_qp(qp);
+
+error_3:
+	ib_destroy_cq(cq);
+
+error_2:
+	ib_dereg_mr(mr);
+
+error_1:
+	ib_dealloc_pd(pd);
+
+error_0:
+	kfree(attr);
+	kfree(init_attr);
+	return ret;
+}
+
+static int create_dev_resources(struct mlx5_ib_resources *devr)
+{
+	struct ib_srq_init_attr attr;
+	struct mlx5_ib_dev *dev;
+	int ret = 0;
+
+	dev = container_of(devr, struct mlx5_ib_dev, devr);
+
+	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
+	if (IS_ERR(devr->p0)) {
+		ret = PTR_ERR(devr->p0);
+		goto error0;
+	}
+	devr->p0->device  = &dev->ib_dev;
+	devr->p0->uobject = NULL;
+	atomic_set(&devr->p0->usecnt, 0);
+
+	devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, 1, 0, NULL, NULL);
+	if (IS_ERR(devr->c0)) {
+		ret = PTR_ERR(devr->c0);
+		goto error1;
+	}
+	devr->c0->device        = &dev->ib_dev;
+	devr->c0->uobject       = NULL;
+	devr->c0->comp_handler  = NULL;
+	devr->c0->event_handler = NULL;
+	devr->c0->cq_context    = NULL;
+	atomic_set(&devr->c0->usecnt, 0);
+
+	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
+	if (IS_ERR(devr->x0)) {
+		ret = PTR_ERR(devr->x0);
+		goto error2;
+	}
+	devr->x0->device = &dev->ib_dev;
+	devr->x0->inode = NULL;
+	atomic_set(&devr->x0->usecnt, 0);
+	mutex_init(&devr->x0->tgt_qp_mutex);
+	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
+
+	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
+	if (IS_ERR(devr->x1)) {
+		ret = PTR_ERR(devr->x1);
+		goto error3;
+	}
+	devr->x1->device = &dev->ib_dev;
+	devr->x1->inode = NULL;
+	atomic_set(&devr->x1->usecnt, 0);
+	mutex_init(&devr->x1->tgt_qp_mutex);
+	INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
+
+	memset(&attr, 0, sizeof(attr));
+	attr.attr.max_sge = 1;
+	attr.attr.max_wr = 1;
+	attr.srq_type = IB_SRQT_XRC;
+	attr.ext.xrc.cq = devr->c0;
+	attr.ext.xrc.xrcd = devr->x0;
+
+	devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
+	if (IS_ERR(devr->s0)) {
+		ret = PTR_ERR(devr->s0);
+		goto error4;
+	}
+	devr->s0->device	= &dev->ib_dev;
+	devr->s0->pd		= devr->p0;
+	devr->s0->uobject       = NULL;
+	devr->s0->event_handler = NULL;
+	devr->s0->srq_context   = NULL;
+	devr->s0->srq_type      = IB_SRQT_XRC;
+	devr->s0->ext.xrc.xrcd	= devr->x0;
+	devr->s0->ext.xrc.cq	= devr->c0;
+	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
+	atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
+	atomic_inc(&devr->p0->usecnt);
+	atomic_set(&devr->s0->usecnt, 0);
+
+	return 0;
+
+error4:
+	mlx5_ib_dealloc_xrcd(devr->x1);
+error3:
+	mlx5_ib_dealloc_xrcd(devr->x0);
+error2:
+	mlx5_ib_destroy_cq(devr->c0);
+error1:
+	mlx5_ib_dealloc_pd(devr->p0);
+error0:
+	return ret;
+}
+
+static void destroy_dev_resources(struct mlx5_ib_resources *devr)
+{
+	mlx5_ib_destroy_srq(devr->s0);
+	mlx5_ib_dealloc_xrcd(devr->x0);
+	mlx5_ib_dealloc_xrcd(devr->x1);
+	mlx5_ib_destroy_cq(devr->c0);
+	mlx5_ib_dealloc_pd(devr->p0);
+}
+
+static int init_one(struct pci_dev *pdev,
+		    const struct pci_device_id *id)
+{
+	struct mlx5_core_dev *mdev;
+	struct mlx5_ib_dev *dev;
+	int err;
+	int i;
+
+	printk_once(KERN_INFO "%s", mlx5_version);
+
+	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
+	if (!dev)
+		return -ENOMEM;
+
+	mdev = &dev->mdev;
+	mdev->event = mlx5_ib_event;
+	if (prof_sel >= ARRAY_SIZE(profile)) {
+		pr_warn("selected pofile out of range, selceting default\n");
+		prof_sel = 0;
+	}
+	mdev->profile = &profile[prof_sel];
+	err = mlx5_dev_init(mdev, pdev);
+	if (err)
+		goto err_free;
+
+	err = get_port_caps(dev);
+	if (err)
+		goto err_cleanup;
+
+	get_ext_port_caps(dev);
+
+	err = alloc_comp_eqs(dev);
+	if (err)
+		goto err_cleanup;
+
+	MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
+
+	strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
+	dev->ib_dev.owner		= THIS_MODULE;
+	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
+	dev->ib_dev.local_dma_lkey	= mdev->caps.reserved_lkey;
+	dev->num_ports		= mdev->caps.num_ports;
+	dev->ib_dev.phys_port_cnt     = dev->num_ports;
+	dev->ib_dev.num_comp_vectors	= dev->num_comp_vectors;
+	dev->ib_dev.dma_device	= &mdev->pdev->dev;
+
+	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
+	dev->ib_dev.uverbs_cmd_mask	=
+		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
+		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
+		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
+		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
+		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
+		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
+		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
+		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
+
+	dev->ib_dev.query_device	= mlx5_ib_query_device;
+	dev->ib_dev.query_port		= mlx5_ib_query_port;
+	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
+	dev->ib_dev.query_pkey		= mlx5_ib_query_pkey;
+	dev->ib_dev.modify_device	= mlx5_ib_modify_device;
+	dev->ib_dev.modify_port		= mlx5_ib_modify_port;
+	dev->ib_dev.alloc_ucontext	= mlx5_ib_alloc_ucontext;
+	dev->ib_dev.dealloc_ucontext	= mlx5_ib_dealloc_ucontext;
+	dev->ib_dev.mmap		= mlx5_ib_mmap;
+	dev->ib_dev.alloc_pd		= mlx5_ib_alloc_pd;
+	dev->ib_dev.dealloc_pd		= mlx5_ib_dealloc_pd;
+	dev->ib_dev.create_ah		= mlx5_ib_create_ah;
+	dev->ib_dev.query_ah		= mlx5_ib_query_ah;
+	dev->ib_dev.destroy_ah		= mlx5_ib_destroy_ah;
+	dev->ib_dev.create_srq		= mlx5_ib_create_srq;
+	dev->ib_dev.modify_srq		= mlx5_ib_modify_srq;
+	dev->ib_dev.query_srq		= mlx5_ib_query_srq;
+	dev->ib_dev.destroy_srq		= mlx5_ib_destroy_srq;
+	dev->ib_dev.post_srq_recv	= mlx5_ib_post_srq_recv;
+	dev->ib_dev.create_qp		= mlx5_ib_create_qp;
+	dev->ib_dev.modify_qp		= mlx5_ib_modify_qp;
+	dev->ib_dev.query_qp		= mlx5_ib_query_qp;
+	dev->ib_dev.destroy_qp		= mlx5_ib_destroy_qp;
+	dev->ib_dev.post_send		= mlx5_ib_post_send;
+	dev->ib_dev.post_recv		= mlx5_ib_post_recv;
+	dev->ib_dev.create_cq		= mlx5_ib_create_cq;
+	dev->ib_dev.modify_cq		= mlx5_ib_modify_cq;
+	dev->ib_dev.resize_cq		= mlx5_ib_resize_cq;
+	dev->ib_dev.destroy_cq		= mlx5_ib_destroy_cq;
+	dev->ib_dev.poll_cq		= mlx5_ib_poll_cq;
+	dev->ib_dev.req_notify_cq	= mlx5_ib_arm_cq;
+	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;
+	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;
+	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr;
+	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
+	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
+	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
+	dev->ib_dev.alloc_fast_reg_mr	= mlx5_ib_alloc_fast_reg_mr;
+	dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
+	dev->ib_dev.free_fast_reg_page_list  = mlx5_ib_free_fast_reg_page_list;
+
+	if (mdev->caps.flags & MLX5_DEV_CAP_FLAG_XRC) {
+		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
+		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
+		dev->ib_dev.uverbs_cmd_mask |=
+			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
+			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
+	}
+
+	err = init_node_data(dev);
+	if (err)
+		goto err_eqs;
+
+	mutex_init(&dev->cap_mask_mutex);
+	spin_lock_init(&dev->mr_lock);
+
+	err = create_dev_resources(&dev->devr);
+	if (err)
+		goto err_eqs;
+
+	if (ib_register_device(&dev->ib_dev, NULL))
+		goto err_rsrc;
+
+	err = create_umr_res(dev);
+	if (err)
+		goto err_dev;
+
+	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
+		if (device_create_file(&dev->ib_dev.dev,
+				       mlx5_class_attributes[i]))
+			goto err_umrc;
+	}
+
+	dev->ib_active = true;
+
+	return 0;
+
+err_umrc:
+	destroy_umrc_res(dev);
+
+err_dev:
+	ib_unregister_device(&dev->ib_dev);
+
+err_rsrc:
+	destroy_dev_resources(&dev->devr);
+
+err_eqs:
+	free_comp_eqs(dev);
+
+err_cleanup:
+	mlx5_dev_cleanup(mdev);
+
+err_free:
+	ib_dealloc_device((struct ib_device *)dev);
+
+	return err;
+}
+
+static void remove_one(struct pci_dev *pdev)
+{
+	struct mlx5_ib_dev *dev = mlx5_pci2ibdev(pdev);
+
+	destroy_umrc_res(dev);
+	ib_unregister_device(&dev->ib_dev);
+	destroy_dev_resources(&dev->devr);
+	free_comp_eqs(dev);
+	mlx5_dev_cleanup(&dev->mdev);
+	ib_dealloc_device(&dev->ib_dev);
+}
+
+static DEFINE_PCI_DEVICE_TABLE(mlx5_ib_pci_table) = {
+	{ PCI_VDEVICE(MELLANOX, 4113) }, /* MT4113 Connect-IB */
+	{ 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mlx5_ib_pci_table);
+
+static struct pci_driver mlx5_ib_driver = {
+	.name		= DRIVER_NAME,
+	.id_table	= mlx5_ib_pci_table,
+	.probe		= init_one,
+	.remove		= remove_one
+};
+
+static int __init mlx5_ib_init(void)
+{
+	return pci_register_driver(&mlx5_ib_driver);
+}
+
+static void __exit mlx5_ib_cleanup(void)
+{
+	pci_unregister_driver(&mlx5_ib_driver);
+}
+
+module_init(mlx5_ib_init);
+module_exit(mlx5_ib_cleanup);
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
new file mode 100644
index 000000000000..3a5322870b96
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <rdma/ib_umem.h>
+#include "mlx5_ib.h"
+
+/* @umem: umem object to scan
+ * @addr: ib virtual address requested by the user
+ * @count: number of PAGE_SIZE pages covered by umem
+ * @shift: page shift for the compound pages found in the region
+ * @ncont: number of compund pages
+ * @order: log2 of the number of compound pages
+ */
+void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
+			int *ncont, int *order)
+{
+	struct ib_umem_chunk *chunk;
+	unsigned long tmp;
+	unsigned long m;
+	int i, j, k;
+	u64 base = 0;
+	int p = 0;
+	int skip;
+	int mask;
+	u64 len;
+	u64 pfn;
+
+	addr = addr >> PAGE_SHIFT;
+	tmp = (unsigned long)addr;
+	m = find_first_bit(&tmp, sizeof(tmp));
+	skip = 1 << m;
+	mask = skip - 1;
+	i = 0;
+	list_for_each_entry(chunk, &umem->chunk_list, list)
+		for (j = 0; j < chunk->nmap; j++) {
+			len = sg_dma_len(&chunk->page_list[j]) >> PAGE_SHIFT;
+			pfn = sg_dma_address(&chunk->page_list[j]) >> PAGE_SHIFT;
+			for (k = 0; k < len; k++) {
+				if (!(i & mask)) {
+					tmp = (unsigned long)pfn;
+					m = min(m, find_first_bit(&tmp, sizeof(tmp)));
+					skip = 1 << m;
+					mask = skip - 1;
+					base = pfn;
+					p = 0;
+				} else {
+					if (base + p != pfn) {
+						tmp = (unsigned long)p;
+						m = find_first_bit(&tmp, sizeof(tmp));
+						skip = 1 << m;
+						mask = skip - 1;
+						base = pfn;
+						p = 0;
+					}
+				}
+				p++;
+				i++;
+			}
+		}
+
+	if (i) {
+		m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m);
+
+		if (order)
+			*order = ilog2(roundup_pow_of_two(i) >> m);
+
+		*ncont = DIV_ROUND_UP(i, (1 << m));
+	} else {
+		m  = 0;
+
+		if (order)
+			*order = 0;
+
+		*ncont = 0;
+	}
+	*shift = PAGE_SHIFT + m;
+	*count = i;
+}
+
+void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+			  int page_shift, __be64 *pas, int umr)
+{
+	int shift = page_shift - PAGE_SHIFT;
+	int mask = (1 << shift) - 1;
+	struct ib_umem_chunk *chunk;
+	int i, j, k;
+	u64 cur = 0;
+	u64 base;
+	int len;
+
+	i = 0;
+	list_for_each_entry(chunk, &umem->chunk_list, list)
+		for (j = 0; j < chunk->nmap; j++) {
+			len = sg_dma_len(&chunk->page_list[j]) >> PAGE_SHIFT;
+			base = sg_dma_address(&chunk->page_list[j]);
+			for (k = 0; k < len; k++) {
+				if (!(i & mask)) {
+					cur = base + (k << PAGE_SHIFT);
+					if (umr)
+						cur |= 3;
+
+					pas[i >> shift] = cpu_to_be64(cur);
+					mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n",
+						    i >> shift, be64_to_cpu(pas[i >> shift]));
+				}  else
+					mlx5_ib_dbg(dev, "=====> 0x%llx\n",
+						    base + (k << PAGE_SHIFT));
+				i++;
+			}
+		}
+}
+
+int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
+{
+	u64 page_size;
+	u64 page_mask;
+	u64 off_size;
+	u64 off_mask;
+	u64 buf_off;
+
+	page_size = 1 << page_shift;
+	page_mask = page_size - 1;
+	buf_off = addr & page_mask;
+	off_size = page_size >> 6;
+	off_mask = off_size - 1;
+
+	if (buf_off & off_mask)
+		return -EINVAL;
+
+	*offset = buf_off >> ilog2(off_size);
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
new file mode 100644
index 000000000000..836be9157242
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -0,0 +1,545 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_IB_H
+#define MLX5_IB_H
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/srq.h>
+#include <linux/types.h>
+
+#define mlx5_ib_dbg(dev, format, arg...)				\
+pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,	\
+	 __LINE__, current->pid, ##arg)
+
+#define mlx5_ib_err(dev, format, arg...)				\
+pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,	\
+	__LINE__, current->pid, ##arg)
+
+#define mlx5_ib_warn(dev, format, arg...)				\
+pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__,	\
+	__LINE__, current->pid, ##arg)
+
+enum {
+	MLX5_IB_MMAP_CMD_SHIFT	= 8,
+	MLX5_IB_MMAP_CMD_MASK	= 0xff,
+};
+
+enum mlx5_ib_mmap_cmd {
+	MLX5_IB_MMAP_REGULAR_PAGE		= 0,
+	MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES	= 1, /* always last */
+};
+
+enum {
+	MLX5_RES_SCAT_DATA32_CQE	= 0x1,
+	MLX5_RES_SCAT_DATA64_CQE	= 0x2,
+	MLX5_REQ_SCAT_DATA32_CQE	= 0x11,
+	MLX5_REQ_SCAT_DATA64_CQE	= 0x22,
+};
+
+enum mlx5_ib_latency_class {
+	MLX5_IB_LATENCY_CLASS_LOW,
+	MLX5_IB_LATENCY_CLASS_MEDIUM,
+	MLX5_IB_LATENCY_CLASS_HIGH,
+	MLX5_IB_LATENCY_CLASS_FAST_PATH
+};
+
+enum mlx5_ib_mad_ifc_flags {
+	MLX5_MAD_IFC_IGNORE_MKEY	= 1,
+	MLX5_MAD_IFC_IGNORE_BKEY	= 2,
+	MLX5_MAD_IFC_NET_VIEW		= 4,
+};
+
+struct mlx5_ib_ucontext {
+	struct ib_ucontext	ibucontext;
+	struct list_head	db_page_list;
+
+	/* protect doorbell record alloc/free
+	 */
+	struct mutex		db_page_mutex;
+	struct mlx5_uuar_info	uuari;
+};
+
+static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext);
+}
+
+struct mlx5_ib_pd {
+	struct ib_pd		ibpd;
+	u32			pdn;
+	u32			pa_lkey;
+};
+
+/* Use macros here so that don't have to duplicate
+ * enum ib_send_flags and enum ib_qp_type for low-level driver
+ */
+
+#define MLX5_IB_SEND_UMR_UNREG	IB_SEND_RESERVED_START
+#define MLX5_IB_QPT_REG_UMR	IB_QPT_RESERVED1
+#define MLX5_IB_WR_UMR		IB_WR_RESERVED1
+
+struct wr_list {
+	u16	opcode;
+	u16	next;
+};
+
+struct mlx5_ib_wq {
+	u64		       *wrid;
+	u32		       *wr_data;
+	struct wr_list	       *w_list;
+	unsigned	       *wqe_head;
+	u16		        unsig_count;
+
+	/* serialize post to the work queue
+	 */
+	spinlock_t		lock;
+	int			wqe_cnt;
+	int			max_post;
+	int			max_gs;
+	int			offset;
+	int			wqe_shift;
+	unsigned		head;
+	unsigned		tail;
+	u16			cur_post;
+	u16			last_poll;
+	void		       *qend;
+};
+
+enum {
+	MLX5_QP_USER,
+	MLX5_QP_KERNEL,
+	MLX5_QP_EMPTY
+};
+
+struct mlx5_ib_qp {
+	struct ib_qp		ibqp;
+	struct mlx5_core_qp	mqp;
+	struct mlx5_buf		buf;
+
+	struct mlx5_db		db;
+	struct mlx5_ib_wq	rq;
+
+	u32			doorbell_qpn;
+	u8			sq_signal_bits;
+	u8			fm_cache;
+	int			sq_max_wqes_per_wr;
+	int			sq_spare_wqes;
+	struct mlx5_ib_wq	sq;
+
+	struct ib_umem	       *umem;
+	int			buf_size;
+
+	/* serialize qp state modifications
+	 */
+	struct mutex		mutex;
+	u16			xrcdn;
+	u32			flags;
+	u8			port;
+	u8			alt_port;
+	u8			atomic_rd_en;
+	u8			resp_depth;
+	u8			state;
+	int			mlx_type;
+	int			wq_sig;
+	int			scat_cqe;
+	int			max_inline_data;
+	struct mlx5_bf	       *bf;
+	int			has_rq;
+
+	/* only for user space QPs. For kernel
+	 * we have it from the bf object
+	 */
+	int			uuarn;
+
+	int			create_type;
+	u32			pa_lkey;
+};
+
+struct mlx5_ib_cq_buf {
+	struct mlx5_buf		buf;
+	struct ib_umem		*umem;
+	int			cqe_size;
+};
+
+enum mlx5_ib_qp_flags {
+	MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK     = 1 << 0,
+	MLX5_IB_QP_SIGNATURE_HANDLING           = 1 << 1,
+};
+
+struct mlx5_shared_mr_info {
+	int mr_id;
+	struct ib_umem		*umem;
+};
+
+struct mlx5_ib_cq {
+	struct ib_cq		ibcq;
+	struct mlx5_core_cq	mcq;
+	struct mlx5_ib_cq_buf	buf;
+	struct mlx5_db		db;
+
+	/* serialize access to the CQ
+	 */
+	spinlock_t		lock;
+
+	/* protect resize cq
+	 */
+	struct mutex		resize_mutex;
+	struct mlx5_ib_cq_resize *resize_buf;
+	struct ib_umem	       *resize_umem;
+	int			cqe_size;
+};
+
+struct mlx5_ib_srq {
+	struct ib_srq		ibsrq;
+	struct mlx5_core_srq	msrq;
+	struct mlx5_buf		buf;
+	struct mlx5_db		db;
+	u64		       *wrid;
+	/* protect SRQ hanlding
+	 */
+	spinlock_t		lock;
+	int			head;
+	int			tail;
+	u16			wqe_ctr;
+	struct ib_umem	       *umem;
+	/* serialize arming a SRQ
+	 */
+	struct mutex		mutex;
+	int			wq_sig;
+};
+
+struct mlx5_ib_xrcd {
+	struct ib_xrcd		ibxrcd;
+	u32			xrcdn;
+};
+
+struct mlx5_ib_mr {
+	struct ib_mr		ibmr;
+	struct mlx5_core_mr	mmr;
+	struct ib_umem	       *umem;
+	struct mlx5_shared_mr_info	*smr_info;
+	struct list_head	list;
+	int			order;
+	int			umred;
+	__be64			*pas;
+	dma_addr_t		dma;
+	int			npages;
+	struct completion	done;
+	enum ib_wc_status	status;
+};
+
+struct mlx5_ib_fast_reg_page_list {
+	struct ib_fast_reg_page_list	ibfrpl;
+	__be64			       *mapped_page_list;
+	dma_addr_t			map;
+};
+
+struct umr_common {
+	struct ib_pd	*pd;
+	struct ib_cq	*cq;
+	struct ib_qp	*qp;
+	struct ib_mr	*mr;
+	/* control access to UMR QP
+	 */
+	struct semaphore	sem;
+};
+
+enum {
+	MLX5_FMR_INVALID,
+	MLX5_FMR_VALID,
+	MLX5_FMR_BUSY,
+};
+
+struct mlx5_ib_fmr {
+	struct ib_fmr			ibfmr;
+	struct mlx5_core_mr		mr;
+	int				access_flags;
+	int				state;
+	/* protect fmr state
+	 */
+	spinlock_t			lock;
+	u64				wrid;
+	struct ib_send_wr		wr[2];
+	u8				page_shift;
+	struct ib_fast_reg_page_list	page_list;
+};
+
+struct mlx5_cache_ent {
+	struct list_head	head;
+	/* sync access to the cahce entry
+	 */
+	spinlock_t		lock;
+
+
+	struct dentry	       *dir;
+	char                    name[4];
+	u32                     order;
+	u32			size;
+	u32                     cur;
+	u32                     miss;
+	u32			limit;
+
+	struct dentry          *fsize;
+	struct dentry          *fcur;
+	struct dentry          *fmiss;
+	struct dentry          *flimit;
+
+	struct mlx5_ib_dev     *dev;
+	struct work_struct	work;
+	struct delayed_work	dwork;
+};
+
+struct mlx5_mr_cache {
+	struct workqueue_struct *wq;
+	struct mlx5_cache_ent	ent[MAX_MR_CACHE_ENTRIES];
+	int			stopped;
+	struct dentry		*root;
+	unsigned long		last_add;
+};
+
+struct mlx5_ib_resources {
+	struct ib_cq	*c0;
+	struct ib_xrcd	*x0;
+	struct ib_xrcd	*x1;
+	struct ib_pd	*p0;
+	struct ib_srq	*s0;
+};
+
+struct mlx5_ib_dev {
+	struct ib_device		ib_dev;
+	struct mlx5_core_dev		mdev;
+	MLX5_DECLARE_DOORBELL_LOCK(uar_lock);
+	struct list_head		eqs_list;
+	int				num_ports;
+	int				num_comp_vectors;
+	/* serialize update of capability mask
+	 */
+	struct mutex			cap_mask_mutex;
+	bool				ib_active;
+	struct umr_common		umrc;
+	/* sync used page count stats
+	 */
+	spinlock_t			mr_lock;
+	struct mlx5_ib_resources	devr;
+	struct mlx5_mr_cache		cache;
+};
+
+static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
+{
+	return container_of(mcq, struct mlx5_ib_cq, mcq);
+}
+
+static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd)
+{
+	return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd);
+}
+
+static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev)
+{
+	return container_of(ibdev, struct mlx5_ib_dev, ib_dev);
+}
+
+static inline struct mlx5_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
+{
+	return container_of(ibfmr, struct mlx5_ib_fmr, ibfmr);
+}
+
+static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq)
+{
+	return container_of(ibcq, struct mlx5_ib_cq, ibcq);
+}
+
+static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp)
+{
+	return container_of(mqp, struct mlx5_ib_qp, mqp);
+}
+
+static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd)
+{
+	return container_of(ibpd, struct mlx5_ib_pd, ibpd);
+}
+
+static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq)
+{
+	return container_of(ibsrq, struct mlx5_ib_srq, ibsrq);
+}
+
+static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp)
+{
+	return container_of(ibqp, struct mlx5_ib_qp, ibqp);
+}
+
+static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq)
+{
+	return container_of(msrq, struct mlx5_ib_srq, msrq);
+}
+
+static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct mlx5_ib_mr, ibmr);
+}
+
+static inline struct mlx5_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl)
+{
+	return container_of(ibfrpl, struct mlx5_ib_fast_reg_page_list, ibfrpl);
+}
+
+struct mlx5_ib_ah {
+	struct ib_ah		ibah;
+	struct mlx5_av		av;
+};
+
+static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct mlx5_ib_ah, ibah);
+}
+
+static inline struct mlx5_ib_dev *mlx5_core2ibdev(struct mlx5_core_dev *dev)
+{
+	return container_of(dev, struct mlx5_ib_dev, mdev);
+}
+
+static inline struct mlx5_ib_dev *mlx5_pci2ibdev(struct pci_dev *pdev)
+{
+	return mlx5_core2ibdev(pci2mlx5_core_dev(pdev));
+}
+
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+			struct mlx5_db *db);
+void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db);
+void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
+void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
+void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index);
+int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+		 int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+		 void *in_mad, void *response_mad);
+struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
+			   struct mlx5_ib_ah *ah);
+struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
+int mlx5_ib_destroy_ah(struct ib_ah *ah);
+struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *init_attr,
+				  struct ib_udata *udata);
+int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr);
+int mlx5_ib_destroy_srq(struct ib_srq *srq);
+int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr);
+struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata);
+int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata);
+int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		     struct ib_qp_init_attr *qp_init_attr);
+int mlx5_ib_destroy_qp(struct ib_qp *qp);
+int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr);
+int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr);
+void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n);
+struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
+				int vector, struct ib_ucontext *context,
+				struct ib_udata *udata);
+int mlx5_ib_destroy_cq(struct ib_cq *cq);
+int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
+struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
+struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				  u64 virt_addr, int access_flags,
+				  struct ib_udata *udata);
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
+struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+					int max_page_list_len);
+struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+							       int page_list_len);
+void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+struct ib_fmr *mlx5_ib_fmr_alloc(struct ib_pd *pd, int acc,
+				 struct ib_fmr_attr *fmr_attr);
+int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+		      int npages, u64 iova);
+int mlx5_ib_unmap_fmr(struct list_head *fmr_list);
+int mlx5_ib_fmr_dealloc(struct ib_fmr *ibfmr);
+int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+			struct ib_wc *in_wc, struct ib_grh *in_grh,
+			struct ib_mad *in_mad, struct ib_mad *out_mad);
+struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
+					  struct ib_ucontext *context,
+					  struct ib_udata *udata);
+int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd);
+int mlx5_vector2eqn(struct mlx5_ib_dev *dev, int vector, int *eqn, int *irqn);
+int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset);
+int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port);
+int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
+		       struct ib_port_attr *props);
+int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev);
+void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev);
+void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
+			int *ncont, int *order);
+void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+			  int page_shift, __be64 *pas, int umr);
+void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
+int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq);
+int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
+int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
+int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift);
+void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context);
+
+static inline void init_query_mad(struct ib_smp *mad)
+{
+	mad->base_version  = 1;
+	mad->mgmt_class    = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+	mad->class_version = 1;
+	mad->method	   = IB_MGMT_METHOD_GET;
+}
+
+static inline u8 convert_access(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC       : 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX5_PERM_LOCAL_WRITE  : 0) |
+	       MLX5_PERM_LOCAL_READ;
+}
+
+#endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
new file mode 100644
index 000000000000..e2daa8f02476
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -0,0 +1,1007 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <linux/kref.h>
+#include <linux/random.h>
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <rdma/ib_umem.h>
+#include "mlx5_ib.h"
+
+enum {
+	DEF_CACHE_SIZE	= 10,
+};
+
+static __be64 *mr_align(__be64 *ptr, int align)
+{
+	unsigned long mask = align - 1;
+
+	return (__be64 *)(((unsigned long)ptr + mask) & ~mask);
+}
+
+static int order2idx(struct mlx5_ib_dev *dev, int order)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+
+	if (order < cache->ent[0].order)
+		return 0;
+	else
+		return order - cache->ent[0].order;
+}
+
+static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
+{
+	struct device *ddev = dev->ib_dev.dma_device;
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent = &cache->ent[c];
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_ib_mr *mr;
+	int npages = 1 << ent->order;
+	int size = sizeof(u64) * npages;
+	int err = 0;
+	int i;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	for (i = 0; i < num; i++) {
+		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+		if (!mr) {
+			err = -ENOMEM;
+			goto out;
+		}
+		mr->order = ent->order;
+		mr->umred = 1;
+		mr->pas = kmalloc(size + 0x3f, GFP_KERNEL);
+		if (!mr->pas) {
+			kfree(mr);
+			err = -ENOMEM;
+			goto out;
+		}
+		mr->dma = dma_map_single(ddev, mr_align(mr->pas, 0x40), size,
+					 DMA_TO_DEVICE);
+		if (dma_mapping_error(ddev, mr->dma)) {
+			kfree(mr->pas);
+			kfree(mr);
+			err = -ENOMEM;
+			goto out;
+		}
+
+		in->seg.status = 1 << 6;
+		in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
+		in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+		in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
+		in->seg.log2_page_size = 12;
+
+		err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in,
+					    sizeof(*in));
+		if (err) {
+			mlx5_ib_warn(dev, "create mkey failed %d\n", err);
+			dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE);
+			kfree(mr->pas);
+			kfree(mr);
+			goto out;
+		}
+		cache->last_add = jiffies;
+
+		spin_lock(&ent->lock);
+		list_add_tail(&mr->list, &ent->head);
+		ent->cur++;
+		ent->size++;
+		spin_unlock(&ent->lock);
+	}
+
+out:
+	kfree(in);
+	return err;
+}
+
+static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
+{
+	struct device *ddev = dev->ib_dev.dma_device;
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent = &cache->ent[c];
+	struct mlx5_ib_mr *mr;
+	int size;
+	int err;
+	int i;
+
+	for (i = 0; i < num; i++) {
+		spin_lock(&ent->lock);
+		if (list_empty(&ent->head)) {
+			spin_unlock(&ent->lock);
+			return;
+		}
+		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
+		list_del(&mr->list);
+		ent->cur--;
+		ent->size--;
+		spin_unlock(&ent->lock);
+		err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
+		if (err) {
+			mlx5_ib_warn(dev, "failed destroy mkey\n");
+		} else {
+			size = ALIGN(sizeof(u64) * (1 << mr->order), 0x40);
+			dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE);
+			kfree(mr->pas);
+			kfree(mr);
+		}
+	}
+}
+
+static ssize_t size_write(struct file *filp, const char __user *buf,
+			  size_t count, loff_t *pos)
+{
+	struct mlx5_cache_ent *ent = filp->private_data;
+	struct mlx5_ib_dev *dev = ent->dev;
+	char lbuf[20];
+	u32 var;
+	int err;
+	int c;
+
+	if (copy_from_user(lbuf, buf, sizeof(lbuf)))
+		return -EPERM;
+
+	c = order2idx(dev, ent->order);
+	lbuf[sizeof(lbuf) - 1] = 0;
+
+	if (sscanf(lbuf, "%u", &var) != 1)
+		return -EINVAL;
+
+	if (var < ent->limit)
+		return -EINVAL;
+
+	if (var > ent->size) {
+		err = add_keys(dev, c, var - ent->size);
+		if (err)
+			return err;
+	} else if (var < ent->size) {
+		remove_keys(dev, c, ent->size - var);
+	}
+
+	return count;
+}
+
+static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
+			 loff_t *pos)
+{
+	struct mlx5_cache_ent *ent = filp->private_data;
+	char lbuf[20];
+	int err;
+
+	if (*pos)
+		return 0;
+
+	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user(buf, lbuf, err))
+		return -EPERM;
+
+	*pos += err;
+
+	return err;
+}
+
+static const struct file_operations size_fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= size_write,
+	.read	= size_read,
+};
+
+static ssize_t limit_write(struct file *filp, const char __user *buf,
+			   size_t count, loff_t *pos)
+{
+	struct mlx5_cache_ent *ent = filp->private_data;
+	struct mlx5_ib_dev *dev = ent->dev;
+	char lbuf[20];
+	u32 var;
+	int err;
+	int c;
+
+	if (copy_from_user(lbuf, buf, sizeof(lbuf)))
+		return -EPERM;
+
+	c = order2idx(dev, ent->order);
+	lbuf[sizeof(lbuf) - 1] = 0;
+
+	if (sscanf(lbuf, "%u", &var) != 1)
+		return -EINVAL;
+
+	if (var > ent->size)
+		return -EINVAL;
+
+	ent->limit = var;
+
+	if (ent->cur < ent->limit) {
+		err = add_keys(dev, c, 2 * ent->limit - ent->cur);
+		if (err)
+			return err;
+	}
+
+	return count;
+}
+
+static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
+			  loff_t *pos)
+{
+	struct mlx5_cache_ent *ent = filp->private_data;
+	char lbuf[20];
+	int err;
+
+	if (*pos)
+		return 0;
+
+	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user(buf, lbuf, err))
+		return -EPERM;
+
+	*pos += err;
+
+	return err;
+}
+
+static const struct file_operations limit_fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= limit_write,
+	.read	= limit_read,
+};
+
+static int someone_adding(struct mlx5_mr_cache *cache)
+{
+	int i;
+
+	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+		if (cache->ent[i].cur < cache->ent[i].limit)
+			return 1;
+	}
+
+	return 0;
+}
+
+static void __cache_work_func(struct mlx5_cache_ent *ent)
+{
+	struct mlx5_ib_dev *dev = ent->dev;
+	struct mlx5_mr_cache *cache = &dev->cache;
+	int i = order2idx(dev, ent->order);
+
+	if (cache->stopped)
+		return;
+
+	ent = &dev->cache.ent[i];
+	if (ent->cur < 2 * ent->limit) {
+		add_keys(dev, i, 1);
+		if (ent->cur < 2 * ent->limit)
+			queue_work(cache->wq, &ent->work);
+	} else if (ent->cur > 2 * ent->limit) {
+		if (!someone_adding(cache) &&
+		    time_after(jiffies, cache->last_add + 60 * HZ)) {
+			remove_keys(dev, i, 1);
+			if (ent->cur > ent->limit)
+				queue_work(cache->wq, &ent->work);
+		} else {
+			queue_delayed_work(cache->wq, &ent->dwork, 60 * HZ);
+		}
+	}
+}
+
+static void delayed_cache_work_func(struct work_struct *work)
+{
+	struct mlx5_cache_ent *ent;
+
+	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
+	__cache_work_func(ent);
+}
+
+static void cache_work_func(struct work_struct *work)
+{
+	struct mlx5_cache_ent *ent;
+
+	ent = container_of(work, struct mlx5_cache_ent, work);
+	__cache_work_func(ent);
+}
+
+static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_ib_mr *mr = NULL;
+	struct mlx5_cache_ent *ent;
+	int c;
+	int i;
+
+	c = order2idx(dev, order);
+	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
+		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
+		return NULL;
+	}
+
+	for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
+		ent = &cache->ent[i];
+
+		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
+
+		spin_lock(&ent->lock);
+		if (!list_empty(&ent->head)) {
+			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
+					      list);
+			list_del(&mr->list);
+			ent->cur--;
+			spin_unlock(&ent->lock);
+			if (ent->cur < ent->limit)
+				queue_work(cache->wq, &ent->work);
+			break;
+		}
+		spin_unlock(&ent->lock);
+
+		queue_work(cache->wq, &ent->work);
+
+		if (mr)
+			break;
+	}
+
+	if (!mr)
+		cache->ent[c].miss++;
+
+	return mr;
+}
+
+static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent;
+	int shrink = 0;
+	int c;
+
+	c = order2idx(dev, mr->order);
+	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
+		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
+		return;
+	}
+	ent = &cache->ent[c];
+	spin_lock(&ent->lock);
+	list_add_tail(&mr->list, &ent->head);
+	ent->cur++;
+	if (ent->cur > 2 * ent->limit)
+		shrink = 1;
+	spin_unlock(&ent->lock);
+
+	if (shrink)
+		queue_work(cache->wq, &ent->work);
+}
+
+static void clean_keys(struct mlx5_ib_dev *dev, int c)
+{
+	struct device *ddev = dev->ib_dev.dma_device;
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent = &cache->ent[c];
+	struct mlx5_ib_mr *mr;
+	int size;
+	int err;
+
+	while (1) {
+		spin_lock(&ent->lock);
+		if (list_empty(&ent->head)) {
+			spin_unlock(&ent->lock);
+			return;
+		}
+		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
+		list_del(&mr->list);
+		ent->cur--;
+		ent->size--;
+		spin_unlock(&ent->lock);
+		err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
+		if (err) {
+			mlx5_ib_warn(dev, "failed destroy mkey\n");
+		} else {
+			size = ALIGN(sizeof(u64) * (1 << mr->order), 0x40);
+			dma_unmap_single(ddev, mr->dma, size, DMA_TO_DEVICE);
+			kfree(mr->pas);
+			kfree(mr);
+		}
+	}
+}
+
+static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent;
+	int i;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	cache->root = debugfs_create_dir("mr_cache", dev->mdev.priv.dbg_root);
+	if (!cache->root)
+		return -ENOMEM;
+
+	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+		ent = &cache->ent[i];
+		sprintf(ent->name, "%d", ent->order);
+		ent->dir = debugfs_create_dir(ent->name,  cache->root);
+		if (!ent->dir)
+			return -ENOMEM;
+
+		ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
+						 &size_fops);
+		if (!ent->fsize)
+			return -ENOMEM;
+
+		ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
+						  &limit_fops);
+		if (!ent->flimit)
+			return -ENOMEM;
+
+		ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
+					       &ent->cur);
+		if (!ent->fcur)
+			return -ENOMEM;
+
+		ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
+						&ent->miss);
+		if (!ent->fmiss)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->cache.root);
+}
+
+int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
+{
+	struct mlx5_mr_cache *cache = &dev->cache;
+	struct mlx5_cache_ent *ent;
+	int limit;
+	int size;
+	int err;
+	int i;
+
+	cache->wq = create_singlethread_workqueue("mkey_cache");
+	if (!cache->wq) {
+		mlx5_ib_warn(dev, "failed to create work queue\n");
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+		INIT_LIST_HEAD(&cache->ent[i].head);
+		spin_lock_init(&cache->ent[i].lock);
+
+		ent = &cache->ent[i];
+		INIT_LIST_HEAD(&ent->head);
+		spin_lock_init(&ent->lock);
+		ent->order = i + 2;
+		ent->dev = dev;
+
+		if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) {
+			size = dev->mdev.profile->mr_cache[i].size;
+			limit = dev->mdev.profile->mr_cache[i].limit;
+		} else {
+			size = DEF_CACHE_SIZE;
+			limit = 0;
+		}
+		INIT_WORK(&ent->work, cache_work_func);
+		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+		ent->limit = limit;
+		queue_work(cache->wq, &ent->work);
+	}
+
+	err = mlx5_mr_cache_debugfs_init(dev);
+	if (err)
+		mlx5_ib_warn(dev, "cache debugfs failure\n");
+
+	return 0;
+}
+
+int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
+{
+	int i;
+
+	dev->cache.stopped = 1;
+	destroy_workqueue(dev->cache.wq);
+
+	mlx5_mr_cache_debugfs_cleanup(dev);
+
+	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
+		clean_keys(dev, i);
+
+	return 0;
+}
+
+struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_core_dev *mdev = &dev->mdev;
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_mkey_seg *seg;
+	struct mlx5_ib_mr *mr;
+	int err;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	seg = &in->seg;
+	seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA;
+	seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64);
+	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+	seg->start_addr = 0;
+
+	err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in));
+	if (err)
+		goto err_in;
+
+	kfree(in);
+	mr->ibmr.lkey = mr->mmr.key;
+	mr->ibmr.rkey = mr->mmr.key;
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+
+err_in:
+	kfree(in);
+
+err_free:
+	kfree(mr);
+
+	return ERR_PTR(err);
+}
+
+static int get_octo_len(u64 addr, u64 len, int page_size)
+{
+	u64 offset;
+	int npages;
+
+	offset = addr & (page_size - 1);
+	npages = ALIGN(len + offset, page_size) >> ilog2(page_size);
+	return (npages + 1) / 2;
+}
+
+static int use_umr(int order)
+{
+	return order <= 17;
+}
+
+static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
+			     struct ib_sge *sg, u64 dma, int n, u32 key,
+			     int page_shift, u64 virt_addr, u64 len,
+			     int access_flags)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct ib_mr *mr = dev->umrc.mr;
+
+	sg->addr = dma;
+	sg->length = ALIGN(sizeof(u64) * n, 64);
+	sg->lkey = mr->lkey;
+
+	wr->next = NULL;
+	wr->send_flags = 0;
+	wr->sg_list = sg;
+	if (n)
+		wr->num_sge = 1;
+	else
+		wr->num_sge = 0;
+
+	wr->opcode = MLX5_IB_WR_UMR;
+	wr->wr.fast_reg.page_list_len = n;
+	wr->wr.fast_reg.page_shift = page_shift;
+	wr->wr.fast_reg.rkey = key;
+	wr->wr.fast_reg.iova_start = virt_addr;
+	wr->wr.fast_reg.length = len;
+	wr->wr.fast_reg.access_flags = access_flags;
+	wr->wr.fast_reg.page_list = (struct ib_fast_reg_page_list *)pd;
+}
+
+static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
+			       struct ib_send_wr *wr, u32 key)
+{
+	wr->send_flags = MLX5_IB_SEND_UMR_UNREG;
+	wr->opcode = MLX5_IB_WR_UMR;
+	wr->wr.fast_reg.rkey = key;
+}
+
+void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context)
+{
+	struct mlx5_ib_mr *mr;
+	struct ib_wc wc;
+	int err;
+
+	while (1) {
+		err = ib_poll_cq(cq, 1, &wc);
+		if (err < 0) {
+			pr_warn("poll cq error %d\n", err);
+			return;
+		}
+		if (err == 0)
+			break;
+
+		mr = (struct mlx5_ib_mr *)(unsigned long)wc.wr_id;
+		mr->status = wc.status;
+		complete(&mr->done);
+	}
+	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+}
+
+static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
+				  u64 virt_addr, u64 len, int npages,
+				  int page_shift, int order, int access_flags)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct umr_common *umrc = &dev->umrc;
+	struct ib_send_wr wr, *bad;
+	struct mlx5_ib_mr *mr;
+	struct ib_sge sg;
+	int err;
+	int i;
+
+	for (i = 0; i < 10; i++) {
+		mr = alloc_cached_mr(dev, order);
+		if (mr)
+			break;
+
+		err = add_keys(dev, order2idx(dev, order), 1);
+		if (err) {
+			mlx5_ib_warn(dev, "add_keys failed\n");
+			break;
+		}
+	}
+
+	if (!mr)
+		return ERR_PTR(-EAGAIN);
+
+	mlx5_ib_populate_pas(dev, umem, page_shift, mr_align(mr->pas, 0x40), 1);
+
+	memset(&wr, 0, sizeof(wr));
+	wr.wr_id = (u64)(unsigned long)mr;
+	prep_umr_reg_wqe(pd, &wr, &sg, mr->dma, npages, mr->mmr.key, page_shift, virt_addr, len, access_flags);
+
+	/* We serialize polls so one process does not kidnap another's
+	 * completion. This is not a problem since wr is completed in
+	 * around 1 usec
+	 */
+	down(&umrc->sem);
+	init_completion(&mr->done);
+	err = ib_post_send(umrc->qp, &wr, &bad);
+	if (err) {
+		mlx5_ib_warn(dev, "post send failed, err %d\n", err);
+		up(&umrc->sem);
+		goto error;
+	}
+	wait_for_completion(&mr->done);
+	up(&umrc->sem);
+
+	if (mr->status != IB_WC_SUCCESS) {
+		mlx5_ib_warn(dev, "reg umr failed\n");
+		err = -EFAULT;
+		goto error;
+	}
+
+	return mr;
+
+error:
+	free_cached_mr(dev, mr);
+	return ERR_PTR(err);
+}
+
+static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
+				     u64 length, struct ib_umem *umem,
+				     int npages, int page_shift,
+				     int access_flags)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_ib_mr *mr;
+	int inlen;
+	int err;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2;
+	in = mlx5_vzalloc(inlen);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_1;
+	}
+	mlx5_ib_populate_pas(dev, umem, page_shift, in->pas, 0);
+
+	in->seg.flags = convert_access(access_flags) |
+		MLX5_ACCESS_MODE_MTT;
+	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
+	in->seg.start_addr = cpu_to_be64(virt_addr);
+	in->seg.len = cpu_to_be64(length);
+	in->seg.bsfs_octo_size = 0;
+	in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
+	in->seg.log2_page_size = page_shift;
+	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+	in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
+	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen);
+	if (err) {
+		mlx5_ib_warn(dev, "create mkey failed\n");
+		goto err_2;
+	}
+	mr->umem = umem;
+	mlx5_vfree(in);
+
+	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
+
+	return mr;
+
+err_2:
+	mlx5_vfree(in);
+
+err_1:
+	kfree(mr);
+
+	return ERR_PTR(err);
+}
+
+struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+				  u64 virt_addr, int access_flags,
+				  struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_mr *mr = NULL;
+	struct ib_umem *umem;
+	int page_shift;
+	int npages;
+	int ncont;
+	int order;
+	int err;
+
+	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx\n",
+		    start, virt_addr, length);
+	umem = ib_umem_get(pd->uobject->context, start, length, access_flags,
+			   0);
+	if (IS_ERR(umem)) {
+		mlx5_ib_dbg(dev, "umem get failed\n");
+		return (void *)umem;
+	}
+
+	mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order);
+	if (!npages) {
+		mlx5_ib_warn(dev, "avoid zero region\n");
+		err = -EINVAL;
+		goto error;
+	}
+
+	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
+		    npages, ncont, order, page_shift);
+
+	if (use_umr(order)) {
+		mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift,
+			     order, access_flags);
+		if (PTR_ERR(mr) == -EAGAIN) {
+			mlx5_ib_dbg(dev, "cache empty for order %d", order);
+			mr = NULL;
+		}
+	}
+
+	if (!mr)
+		mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift,
+				access_flags);
+
+	if (IS_ERR(mr)) {
+		err = PTR_ERR(mr);
+		goto error;
+	}
+
+	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key);
+
+	mr->umem = umem;
+	mr->npages = npages;
+	spin_lock(&dev->mr_lock);
+	dev->mdev.priv.reg_pages += npages;
+	spin_unlock(&dev->mr_lock);
+	mr->ibmr.lkey = mr->mmr.key;
+	mr->ibmr.rkey = mr->mmr.key;
+
+	return &mr->ibmr;
+
+error:
+	ib_umem_release(umem);
+	return ERR_PTR(err);
+}
+
+static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+	struct umr_common *umrc = &dev->umrc;
+	struct ib_send_wr wr, *bad;
+	int err;
+
+	memset(&wr, 0, sizeof(wr));
+	wr.wr_id = (u64)(unsigned long)mr;
+	prep_umr_unreg_wqe(dev, &wr, mr->mmr.key);
+
+	down(&umrc->sem);
+	init_completion(&mr->done);
+	err = ib_post_send(umrc->qp, &wr, &bad);
+	if (err) {
+		up(&umrc->sem);
+		mlx5_ib_dbg(dev, "err %d\n", err);
+		goto error;
+	}
+	wait_for_completion(&mr->done);
+	up(&umrc->sem);
+	if (mr->status != IB_WC_SUCCESS) {
+		mlx5_ib_warn(dev, "unreg umr failed\n");
+		err = -EFAULT;
+		goto error;
+	}
+	return 0;
+
+error:
+	return err;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+	struct mlx5_ib_mr *mr = to_mmr(ibmr);
+	struct ib_umem *umem = mr->umem;
+	int npages = mr->npages;
+	int umred = mr->umred;
+	int err;
+
+	if (!umred) {
+		err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr);
+		if (err) {
+			mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
+				     mr->mmr.key, err);
+			return err;
+		}
+	} else {
+		err = unreg_umr(dev, mr);
+		if (err) {
+			mlx5_ib_warn(dev, "failed unregister\n");
+			return err;
+		}
+		free_cached_mr(dev, mr);
+	}
+
+	if (umem) {
+		ib_umem_release(umem);
+		spin_lock(&dev->mr_lock);
+		dev->mdev.priv.reg_pages -= npages;
+		spin_unlock(&dev->mr_lock);
+	}
+
+	if (!umred)
+		kfree(mr);
+
+	return 0;
+}
+
+struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+					int max_page_list_len)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_create_mkey_mbox_in *in;
+	struct mlx5_ib_mr *mr;
+	int err;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	in->seg.status = 1 << 6; /* free */
+	in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2);
+	in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+	in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;
+	in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
+	/*
+	 * TBD not needed - issue 197292 */
+	in->seg.log2_page_size = PAGE_SHIFT;
+
+	err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in));
+	kfree(in);
+	if (err)
+		goto err_free;
+
+	mr->ibmr.lkey = mr->mmr.key;
+	mr->ibmr.rkey = mr->mmr.key;
+	mr->umem = NULL;
+
+	return &mr->ibmr;
+
+err_free:
+	kfree(mr);
+	return ERR_PTR(err);
+}
+
+struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+							       int page_list_len)
+{
+	struct mlx5_ib_fast_reg_page_list *mfrpl;
+	int size = page_list_len * sizeof(u64);
+
+	mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL);
+	if (!mfrpl)
+		return ERR_PTR(-ENOMEM);
+
+	mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
+	if (!mfrpl->ibfrpl.page_list)
+		goto err_free;
+
+	mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device,
+						     size, &mfrpl->map,
+						     GFP_KERNEL);
+	if (!mfrpl->mapped_page_list)
+		goto err_free;
+
+	WARN_ON(mfrpl->map & 0x3f);
+
+	return &mfrpl->ibfrpl;
+
+err_free:
+	kfree(mfrpl->ibfrpl.page_list);
+	kfree(mfrpl);
+	return ERR_PTR(-ENOMEM);
+}
+
+void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+	struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
+	struct mlx5_ib_dev *dev = to_mdev(page_list->device);
+	int size = page_list->max_page_list_len * sizeof(u64);
+
+	dma_free_coherent(&dev->mdev.pdev->dev, size, mfrpl->mapped_page_list,
+			  mfrpl->map);
+	kfree(mfrpl->ibfrpl.page_list);
+	kfree(mfrpl);
+}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
new file mode 100644
index 000000000000..16ac54c9819f
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -0,0 +1,2524 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <rdma/ib_umem.h>
+#include "mlx5_ib.h"
+#include "user.h"
+
+/* not supported currently */
+static int wq_signature;
+
+enum {
+	MLX5_IB_ACK_REQ_FREQ	= 8,
+};
+
+enum {
+	MLX5_IB_DEFAULT_SCHED_QUEUE	= 0x83,
+	MLX5_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f,
+	MLX5_IB_LINK_TYPE_IB		= 0,
+	MLX5_IB_LINK_TYPE_ETH		= 1
+};
+
+enum {
+	MLX5_IB_SQ_STRIDE	= 6,
+	MLX5_IB_CACHE_LINE_SIZE	= 64,
+};
+
+static const u32 mlx5_ib_opcode[] = {
+	[IB_WR_SEND]				= MLX5_OPCODE_SEND,
+	[IB_WR_SEND_WITH_IMM]			= MLX5_OPCODE_SEND_IMM,
+	[IB_WR_RDMA_WRITE]			= MLX5_OPCODE_RDMA_WRITE,
+	[IB_WR_RDMA_WRITE_WITH_IMM]		= MLX5_OPCODE_RDMA_WRITE_IMM,
+	[IB_WR_RDMA_READ]			= MLX5_OPCODE_RDMA_READ,
+	[IB_WR_ATOMIC_CMP_AND_SWP]		= MLX5_OPCODE_ATOMIC_CS,
+	[IB_WR_ATOMIC_FETCH_AND_ADD]		= MLX5_OPCODE_ATOMIC_FA,
+	[IB_WR_SEND_WITH_INV]			= MLX5_OPCODE_SEND_INVAL,
+	[IB_WR_LOCAL_INV]			= MLX5_OPCODE_UMR,
+	[IB_WR_FAST_REG_MR]			= MLX5_OPCODE_UMR,
+	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= MLX5_OPCODE_ATOMIC_MASKED_CS,
+	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= MLX5_OPCODE_ATOMIC_MASKED_FA,
+	[MLX5_IB_WR_UMR]			= MLX5_OPCODE_UMR,
+};
+
+struct umr_wr {
+	u64				virt_addr;
+	struct ib_pd		       *pd;
+	unsigned int			page_shift;
+	unsigned int			npages;
+	u32				length;
+	int				access_flags;
+	u32				mkey;
+};
+
+static int is_qp0(enum ib_qp_type qp_type)
+{
+	return qp_type == IB_QPT_SMI;
+}
+
+static int is_qp1(enum ib_qp_type qp_type)
+{
+	return qp_type == IB_QPT_GSI;
+}
+
+static int is_sqp(enum ib_qp_type qp_type)
+{
+	return is_qp0(qp_type) || is_qp1(qp_type);
+}
+
+static void *get_wqe(struct mlx5_ib_qp *qp, int offset)
+{
+	return mlx5_buf_offset(&qp->buf, offset);
+}
+
+static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n)
+{
+	return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
+}
+
+void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
+{
+	return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE));
+}
+
+static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
+{
+	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
+	struct ib_event event;
+
+	if (type == MLX5_EVENT_TYPE_PATH_MIG)
+		to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+	if (ibqp->event_handler) {
+		event.device     = ibqp->device;
+		event.element.qp = ibqp;
+		switch (type) {
+		case MLX5_EVENT_TYPE_PATH_MIG:
+			event.event = IB_EVENT_PATH_MIG;
+			break;
+		case MLX5_EVENT_TYPE_COMM_EST:
+			event.event = IB_EVENT_COMM_EST;
+			break;
+		case MLX5_EVENT_TYPE_SQ_DRAINED:
+			event.event = IB_EVENT_SQ_DRAINED;
+			break;
+		case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+			event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+			break;
+		case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+			event.event = IB_EVENT_QP_FATAL;
+			break;
+		case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+			event.event = IB_EVENT_PATH_MIG_ERR;
+			break;
+		case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+			event.event = IB_EVENT_QP_REQ_ERR;
+			break;
+		case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+			event.event = IB_EVENT_QP_ACCESS_ERR;
+			break;
+		default:
+			pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn);
+			return;
+		}
+
+		ibqp->event_handler(&event, ibqp->qp_context);
+	}
+}
+
+static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap,
+		       int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd)
+{
+	int wqe_size;
+	int wq_size;
+
+	/* Sanity check RQ size before proceeding */
+	if (cap->max_recv_wr  > dev->mdev.caps.max_wqes)
+		return -EINVAL;
+
+	if (!has_rq) {
+		qp->rq.max_gs = 0;
+		qp->rq.wqe_cnt = 0;
+		qp->rq.wqe_shift = 0;
+	} else {
+		if (ucmd) {
+			qp->rq.wqe_cnt = ucmd->rq_wqe_count;
+			qp->rq.wqe_shift = ucmd->rq_wqe_shift;
+			qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
+			qp->rq.max_post = qp->rq.wqe_cnt;
+		} else {
+			wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0;
+			wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg);
+			wqe_size = roundup_pow_of_two(wqe_size);
+			wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size;
+			wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB);
+			qp->rq.wqe_cnt = wq_size / wqe_size;
+			if (wqe_size > dev->mdev.caps.max_rq_desc_sz) {
+				mlx5_ib_dbg(dev, "wqe_size %d, max %d\n",
+					    wqe_size,
+					    dev->mdev.caps.max_rq_desc_sz);
+				return -EINVAL;
+			}
+			qp->rq.wqe_shift = ilog2(wqe_size);
+			qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
+			qp->rq.max_post = qp->rq.wqe_cnt;
+		}
+	}
+
+	return 0;
+}
+
+static int sq_overhead(enum ib_qp_type qp_type)
+{
+	int size;
+
+	switch (qp_type) {
+	case IB_QPT_XRC_INI:
+		size = sizeof(struct mlx5_wqe_xrc_seg);
+		/* fall through */
+	case IB_QPT_RC:
+		size += sizeof(struct mlx5_wqe_ctrl_seg) +
+			sizeof(struct mlx5_wqe_atomic_seg) +
+			sizeof(struct mlx5_wqe_raddr_seg);
+		break;
+
+	case IB_QPT_UC:
+		size = sizeof(struct mlx5_wqe_ctrl_seg) +
+			sizeof(struct mlx5_wqe_raddr_seg);
+		break;
+
+	case IB_QPT_UD:
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+		size = sizeof(struct mlx5_wqe_ctrl_seg) +
+			sizeof(struct mlx5_wqe_datagram_seg);
+		break;
+
+	case MLX5_IB_QPT_REG_UMR:
+		size = sizeof(struct mlx5_wqe_ctrl_seg) +
+			sizeof(struct mlx5_wqe_umr_ctrl_seg) +
+			sizeof(struct mlx5_mkey_seg);
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	return size;
+}
+
+static int calc_send_wqe(struct ib_qp_init_attr *attr)
+{
+	int inl_size = 0;
+	int size;
+
+	size = sq_overhead(attr->qp_type);
+	if (size < 0)
+		return size;
+
+	if (attr->cap.max_inline_data) {
+		inl_size = size + sizeof(struct mlx5_wqe_inline_seg) +
+			attr->cap.max_inline_data;
+	}
+
+	size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg);
+
+	return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB);
+}
+
+static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
+			struct mlx5_ib_qp *qp)
+{
+	int wqe_size;
+	int wq_size;
+
+	if (!attr->cap.max_send_wr)
+		return 0;
+
+	wqe_size = calc_send_wqe(attr);
+	mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size);
+	if (wqe_size < 0)
+		return wqe_size;
+
+	if (wqe_size > dev->mdev.caps.max_sq_desc_sz) {
+		mlx5_ib_dbg(dev, "\n");
+		return -EINVAL;
+	}
+
+	qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) -
+		sizeof(struct mlx5_wqe_inline_seg);
+	attr->cap.max_inline_data = qp->max_inline_data;
+
+	wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size);
+	qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
+	qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
+	qp->sq.max_gs = attr->cap.max_send_sge;
+	qp->sq.max_post = 1 << ilog2(wq_size / wqe_size);
+
+	return wq_size;
+}
+
+static int set_user_buf_size(struct mlx5_ib_dev *dev,
+			    struct mlx5_ib_qp *qp,
+			    struct mlx5_ib_create_qp *ucmd)
+{
+	int desc_sz = 1 << qp->sq.wqe_shift;
+
+	if (desc_sz > dev->mdev.caps.max_sq_desc_sz) {
+		mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n",
+			     desc_sz, dev->mdev.caps.max_sq_desc_sz);
+		return -EINVAL;
+	}
+
+	if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) {
+		mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n",
+			     ucmd->sq_wqe_count, ucmd->sq_wqe_count);
+		return -EINVAL;
+	}
+
+	qp->sq.wqe_cnt = ucmd->sq_wqe_count;
+
+	if (qp->sq.wqe_cnt > dev->mdev.caps.max_wqes) {
+		mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n",
+			     qp->sq.wqe_cnt, dev->mdev.caps.max_wqes);
+		return -EINVAL;
+	}
+
+	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+		(qp->sq.wqe_cnt << 6);
+
+	return 0;
+}
+
+static int qp_has_rq(struct ib_qp_init_attr *attr)
+{
+	if (attr->qp_type == IB_QPT_XRC_INI ||
+	    attr->qp_type == IB_QPT_XRC_TGT || attr->srq ||
+	    attr->qp_type == MLX5_IB_QPT_REG_UMR ||
+	    !attr->cap.max_recv_wr)
+		return 0;
+
+	return 1;
+}
+
+static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari)
+{
+	int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE;
+	int start_uuar;
+	int i;
+
+	start_uuar = nuuars - uuari->num_low_latency_uuars;
+	for (i = start_uuar; i < nuuars; i++) {
+		if (!test_bit(i, uuari->bitmap)) {
+			set_bit(i, uuari->bitmap);
+			uuari->count[i]++;
+			return i;
+		}
+	}
+
+	return -ENOMEM;
+}
+
+static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari)
+{
+	int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE;
+	int minidx = 1;
+	int uuarn;
+	int end;
+	int i;
+
+	end = nuuars - uuari->num_low_latency_uuars;
+
+	for (i = 1; i < end; i++) {
+		uuarn = i & 3;
+		if (uuarn == 2 || uuarn == 3)
+			continue;
+
+		if (uuari->count[i] < uuari->count[minidx])
+			minidx = i;
+	}
+
+	uuari->count[minidx]++;
+	return minidx;
+}
+
+static int alloc_uuar(struct mlx5_uuar_info *uuari,
+		      enum mlx5_ib_latency_class lat)
+{
+	int uuarn = -EINVAL;
+
+	mutex_lock(&uuari->lock);
+	switch (lat) {
+	case MLX5_IB_LATENCY_CLASS_LOW:
+		uuarn = 0;
+		uuari->count[uuarn]++;
+		break;
+
+	case MLX5_IB_LATENCY_CLASS_MEDIUM:
+		uuarn = alloc_med_class_uuar(uuari);
+		break;
+
+	case MLX5_IB_LATENCY_CLASS_HIGH:
+		uuarn = alloc_high_class_uuar(uuari);
+		break;
+
+	case MLX5_IB_LATENCY_CLASS_FAST_PATH:
+		uuarn = 2;
+		break;
+	}
+	mutex_unlock(&uuari->lock);
+
+	return uuarn;
+}
+
+static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+{
+	clear_bit(uuarn, uuari->bitmap);
+	--uuari->count[uuarn];
+}
+
+static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+{
+	clear_bit(uuarn, uuari->bitmap);
+	--uuari->count[uuarn];
+}
+
+static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+{
+	int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE;
+	int high_uuar = nuuars - uuari->num_low_latency_uuars;
+
+	mutex_lock(&uuari->lock);
+	if (uuarn == 0) {
+		--uuari->count[uuarn];
+		goto out;
+	}
+
+	if (uuarn < high_uuar) {
+		free_med_class_uuar(uuari, uuarn);
+		goto out;
+	}
+
+	free_high_class_uuar(uuari, uuarn);
+
+out:
+	mutex_unlock(&uuari->lock);
+}
+
+static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state)
+{
+	switch (state) {
+	case IB_QPS_RESET:	return MLX5_QP_STATE_RST;
+	case IB_QPS_INIT:	return MLX5_QP_STATE_INIT;
+	case IB_QPS_RTR:	return MLX5_QP_STATE_RTR;
+	case IB_QPS_RTS:	return MLX5_QP_STATE_RTS;
+	case IB_QPS_SQD:	return MLX5_QP_STATE_SQD;
+	case IB_QPS_SQE:	return MLX5_QP_STATE_SQER;
+	case IB_QPS_ERR:	return MLX5_QP_STATE_ERR;
+	default:		return -1;
+	}
+}
+
+static int to_mlx5_st(enum ib_qp_type type)
+{
+	switch (type) {
+	case IB_QPT_RC:			return MLX5_QP_ST_RC;
+	case IB_QPT_UC:			return MLX5_QP_ST_UC;
+	case IB_QPT_UD:			return MLX5_QP_ST_UD;
+	case MLX5_IB_QPT_REG_UMR:	return MLX5_QP_ST_REG_UMR;
+	case IB_QPT_XRC_INI:
+	case IB_QPT_XRC_TGT:		return MLX5_QP_ST_XRC;
+	case IB_QPT_SMI:		return MLX5_QP_ST_QP0;
+	case IB_QPT_GSI:		return MLX5_QP_ST_QP1;
+	case IB_QPT_RAW_IPV6:		return MLX5_QP_ST_RAW_IPV6;
+	case IB_QPT_RAW_ETHERTYPE:	return MLX5_QP_ST_RAW_ETHERTYPE;
+	case IB_QPT_RAW_PACKET:
+	case IB_QPT_MAX:
+	default:		return -EINVAL;
+	}
+}
+
+static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn)
+{
+	return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index;
+}
+
+static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+			  struct mlx5_ib_qp *qp, struct ib_udata *udata,
+			  struct mlx5_create_qp_mbox_in **in,
+			  struct mlx5_ib_create_qp_resp *resp, int *inlen)
+{
+	struct mlx5_ib_ucontext *context;
+	struct mlx5_ib_create_qp ucmd;
+	int page_shift;
+	int uar_index;
+	int npages;
+	u32 offset;
+	int uuarn;
+	int ncont;
+	int err;
+
+	err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
+	if (err) {
+		mlx5_ib_dbg(dev, "copy failed\n");
+		return err;
+	}
+
+	context = to_mucontext(pd->uobject->context);
+	/*
+	 * TBD: should come from the verbs when we have the API
+	 */
+	uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH);
+	if (uuarn < 0) {
+		mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n");
+		mlx5_ib_dbg(dev, "reverting to high latency\n");
+		uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW);
+		if (uuarn < 0) {
+			mlx5_ib_dbg(dev, "uuar allocation failed\n");
+			return uuarn;
+		}
+	}
+
+	uar_index = uuarn_to_uar_index(&context->uuari, uuarn);
+	mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index);
+
+	err = set_user_buf_size(dev, qp, &ucmd);
+	if (err)
+		goto err_uuar;
+
+	qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+			       qp->buf_size, 0, 0);
+	if (IS_ERR(qp->umem)) {
+		mlx5_ib_dbg(dev, "umem_get failed\n");
+		err = PTR_ERR(qp->umem);
+		goto err_uuar;
+	}
+
+	mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift,
+			   &ncont, NULL);
+	err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset);
+	if (err) {
+		mlx5_ib_warn(dev, "bad offset\n");
+		goto err_umem;
+	}
+	mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n",
+		    ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset);
+
+	*inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
+	*in = mlx5_vzalloc(*inlen);
+	if (!*in) {
+		err = -ENOMEM;
+		goto err_umem;
+	}
+	mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0);
+	(*in)->ctx.log_pg_sz_remote_qpn =
+		cpu_to_be32((page_shift - PAGE_SHIFT) << 24);
+	(*in)->ctx.params2 = cpu_to_be32(offset << 6);
+
+	(*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
+	resp->uuar_index = uuarn;
+	qp->uuarn = uuarn;
+
+	err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
+	if (err) {
+		mlx5_ib_dbg(dev, "map failed\n");
+		goto err_free;
+	}
+
+	err = ib_copy_to_udata(udata, resp, sizeof(*resp));
+	if (err) {
+		mlx5_ib_dbg(dev, "copy failed\n");
+		goto err_unmap;
+	}
+	qp->create_type = MLX5_QP_USER;
+
+	return 0;
+
+err_unmap:
+	mlx5_ib_db_unmap_user(context, &qp->db);
+
+err_free:
+	mlx5_vfree(*in);
+
+err_umem:
+	ib_umem_release(qp->umem);
+
+err_uuar:
+	free_uuar(&context->uuari, uuarn);
+	return err;
+}
+
+static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp)
+{
+	struct mlx5_ib_ucontext *context;
+
+	context = to_mucontext(pd->uobject->context);
+	mlx5_ib_db_unmap_user(context, &qp->db);
+	ib_umem_release(qp->umem);
+	free_uuar(&context->uuari, qp->uuarn);
+}
+
+static int create_kernel_qp(struct mlx5_ib_dev *dev,
+			    struct ib_qp_init_attr *init_attr,
+			    struct mlx5_ib_qp *qp,
+			    struct mlx5_create_qp_mbox_in **in, int *inlen)
+{
+	enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW;
+	struct mlx5_uuar_info *uuari;
+	int uar_index;
+	int uuarn;
+	int err;
+
+	uuari = &dev->mdev.priv.uuari;
+	if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+		qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
+	if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)
+		lc = MLX5_IB_LATENCY_CLASS_FAST_PATH;
+
+	uuarn = alloc_uuar(uuari, lc);
+	if (uuarn < 0) {
+		mlx5_ib_dbg(dev, "\n");
+		return -ENOMEM;
+	}
+
+	qp->bf = &uuari->bfs[uuarn];
+	uar_index = qp->bf->uar->index;
+
+	err = calc_sq_size(dev, init_attr, qp);
+	if (err < 0) {
+		mlx5_ib_dbg(dev, "err %d\n", err);
+		goto err_uuar;
+	}
+
+	qp->rq.offset = 0;
+	qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+	qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
+
+	err = mlx5_buf_alloc(&dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf);
+	if (err) {
+		mlx5_ib_dbg(dev, "err %d\n", err);
+		goto err_uuar;
+	}
+
+	qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt);
+	*inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages;
+	*in = mlx5_vzalloc(*inlen);
+	if (!*in) {
+		err = -ENOMEM;
+		goto err_buf;
+	}
+	(*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
+	(*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - PAGE_SHIFT) << 24);
+	/* Set "fast registration enabled" for all kernel QPs */
+	(*in)->ctx.params1 |= cpu_to_be32(1 << 11);
+	(*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4);
+
+	mlx5_fill_page_array(&qp->buf, (*in)->pas);
+
+	err = mlx5_db_alloc(&dev->mdev, &qp->db);
+	if (err) {
+		mlx5_ib_dbg(dev, "err %d\n", err);
+		goto err_free;
+	}
+
+	qp->db.db[0] = 0;
+	qp->db.db[1] = 0;
+
+	qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL);
+	qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL);
+	qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL);
+	qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL);
+	qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL);
+
+	if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid ||
+	    !qp->sq.w_list || !qp->sq.wqe_head) {
+		err = -ENOMEM;
+		goto err_wrid;
+	}
+	qp->create_type = MLX5_QP_KERNEL;
+
+	return 0;
+
+err_wrid:
+	mlx5_db_free(&dev->mdev, &qp->db);
+	kfree(qp->sq.wqe_head);
+	kfree(qp->sq.w_list);
+	kfree(qp->sq.wrid);
+	kfree(qp->sq.wr_data);
+	kfree(qp->rq.wrid);
+
+err_free:
+	mlx5_vfree(*in);
+
+err_buf:
+	mlx5_buf_free(&dev->mdev, &qp->buf);
+
+err_uuar:
+	free_uuar(&dev->mdev.priv.uuari, uuarn);
+	return err;
+}
+
+static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
+{
+	mlx5_db_free(&dev->mdev, &qp->db);
+	kfree(qp->sq.wqe_head);
+	kfree(qp->sq.w_list);
+	kfree(qp->sq.wrid);
+	kfree(qp->sq.wr_data);
+	kfree(qp->rq.wrid);
+	mlx5_buf_free(&dev->mdev, &qp->buf);
+	free_uuar(&dev->mdev.priv.uuari, qp->bf->uuarn);
+}
+
+static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr)
+{
+	if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) ||
+	    (attr->qp_type == IB_QPT_XRC_INI))
+		return cpu_to_be32(MLX5_SRQ_RQ);
+	else if (!qp->has_rq)
+		return cpu_to_be32(MLX5_ZERO_LEN_RQ);
+	else
+		return cpu_to_be32(MLX5_NON_ZERO_RQ);
+}
+
+static int is_connected(enum ib_qp_type qp_type)
+{
+	if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC)
+		return 1;
+
+	return 0;
+}
+
+static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+			    struct ib_qp_init_attr *init_attr,
+			    struct ib_udata *udata, struct mlx5_ib_qp *qp)
+{
+	struct mlx5_ib_resources *devr = &dev->devr;
+	struct mlx5_ib_create_qp_resp resp;
+	struct mlx5_create_qp_mbox_in *in;
+	struct mlx5_ib_create_qp ucmd;
+	int inlen = sizeof(*in);
+	int err;
+
+	mutex_init(&qp->mutex);
+	spin_lock_init(&qp->sq.lock);
+	spin_lock_init(&qp->rq.lock);
+
+	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+		qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
+
+	if (pd && pd->uobject) {
+		if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
+			mlx5_ib_dbg(dev, "copy failed\n");
+			return -EFAULT;
+		}
+
+		qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
+		qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
+	} else {
+		qp->wq_sig = !!wq_signature;
+	}
+
+	qp->has_rq = qp_has_rq(init_attr);
+	err = set_rq_size(dev, &init_attr->cap, qp->has_rq,
+			  qp, (pd && pd->uobject) ? &ucmd : NULL);
+	if (err) {
+		mlx5_ib_dbg(dev, "err %d\n", err);
+		return err;
+	}
+
+	if (pd) {
+		if (pd->uobject) {
+			mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count);
+			if (ucmd.rq_wqe_shift != qp->rq.wqe_shift ||
+			    ucmd.rq_wqe_count != qp->rq.wqe_cnt) {
+				mlx5_ib_dbg(dev, "invalid rq params\n");
+				return -EINVAL;
+			}
+			if (ucmd.sq_wqe_count > dev->mdev.caps.max_wqes) {
+				mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n",
+					    ucmd.sq_wqe_count, dev->mdev.caps.max_wqes);
+				return -EINVAL;
+			}
+			err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen);
+			if (err)
+				mlx5_ib_dbg(dev, "err %d\n", err);
+		} else {
+			err = create_kernel_qp(dev, init_attr, qp, &in, &inlen);
+			if (err)
+				mlx5_ib_dbg(dev, "err %d\n", err);
+			else
+				qp->pa_lkey = to_mpd(pd)->pa_lkey;
+		}
+
+		if (err)
+			return err;
+	} else {
+		in = mlx5_vzalloc(sizeof(*in));
+		if (!in)
+			return -ENOMEM;
+
+		qp->create_type = MLX5_QP_EMPTY;
+	}
+
+	if (is_sqp(init_attr->qp_type))
+		qp->port = init_attr->port_num;
+
+	in->ctx.flags = cpu_to_be32(to_mlx5_st(init_attr->qp_type) << 16 |
+				    MLX5_QP_PM_MIGRATED << 11);
+
+	if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR)
+		in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? pd : devr->p0)->pdn);
+	else
+		in->ctx.flags_pd = cpu_to_be32(MLX5_QP_LAT_SENSITIVE);
+
+	if (qp->wq_sig)
+		in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG);
+
+	if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
+		int rcqe_sz;
+		int scqe_sz;
+
+		rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq);
+		scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq);
+
+		if (rcqe_sz == 128)
+			in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE;
+		else
+			in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE;
+
+		if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) {
+			if (scqe_sz == 128)
+				in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE;
+			else
+				in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE;
+		}
+	}
+
+	if (qp->rq.wqe_cnt) {
+		in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4);
+		in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3;
+	}
+
+	in->ctx.rq_type_srqn = get_rx_type(qp, init_attr);
+
+	if (qp->sq.wqe_cnt)
+		in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11);
+	else
+		in->ctx.sq_crq_size |= cpu_to_be16(0x8000);
+
+	/* Set default resources */
+	switch (init_attr->qp_type) {
+	case IB_QPT_XRC_TGT:
+		in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
+		in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
+		in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn);
+		in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn);
+		break;
+	case IB_QPT_XRC_INI:
+		in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
+		in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn);
+		in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn);
+		break;
+	default:
+		if (init_attr->srq) {
+			in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn);
+			in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn);
+		} else {
+			in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn);
+			in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn);
+		}
+	}
+
+	if (init_attr->send_cq)
+		in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn);
+
+	if (init_attr->recv_cq)
+		in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn);
+
+	in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma);
+
+	err = mlx5_core_create_qp(&dev->mdev, &qp->mqp, in, inlen);
+	if (err) {
+		mlx5_ib_dbg(dev, "create qp failed\n");
+		goto err_create;
+	}
+
+	mlx5_vfree(in);
+	/* Hardware wants QPN written in big-endian order (after
+	 * shifting) for send doorbell.  Precompute this value to save
+	 * a little bit when posting sends.
+	 */
+	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+	qp->mqp.event = mlx5_ib_qp_event;
+
+	return 0;
+
+err_create:
+	if (qp->create_type == MLX5_QP_USER)
+		destroy_qp_user(pd, qp);
+	else if (qp->create_type == MLX5_QP_KERNEL)
+		destroy_qp_kernel(dev, qp);
+
+	mlx5_vfree(in);
+	return err;
+}
+
+static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
+	__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
+{
+	if (send_cq) {
+		if (recv_cq) {
+			if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
+				spin_lock_irq(&send_cq->lock);
+				spin_lock_nested(&recv_cq->lock,
+						 SINGLE_DEPTH_NESTING);
+			} else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
+				spin_lock_irq(&send_cq->lock);
+				__acquire(&recv_cq->lock);
+			} else {
+				spin_lock_irq(&recv_cq->lock);
+				spin_lock_nested(&send_cq->lock,
+						 SINGLE_DEPTH_NESTING);
+			}
+		} else {
+			spin_lock_irq(&send_cq->lock);
+		}
+	} else if (recv_cq) {
+		spin_lock_irq(&recv_cq->lock);
+	}
+}
+
+static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
+	__releases(&send_cq->lock) __releases(&recv_cq->lock)
+{
+	if (send_cq) {
+		if (recv_cq) {
+			if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
+				spin_unlock(&recv_cq->lock);
+				spin_unlock_irq(&send_cq->lock);
+			} else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
+				__release(&recv_cq->lock);
+				spin_unlock_irq(&send_cq->lock);
+			} else {
+				spin_unlock(&send_cq->lock);
+				spin_unlock_irq(&recv_cq->lock);
+			}
+		} else {
+			spin_unlock_irq(&send_cq->lock);
+		}
+	} else if (recv_cq) {
+		spin_unlock_irq(&recv_cq->lock);
+	}
+}
+
+static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp)
+{
+	return to_mpd(qp->ibqp.pd);
+}
+
+static void get_cqs(struct mlx5_ib_qp *qp,
+		    struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq)
+{
+	switch (qp->ibqp.qp_type) {
+	case IB_QPT_XRC_TGT:
+		*send_cq = NULL;
+		*recv_cq = NULL;
+		break;
+	case MLX5_IB_QPT_REG_UMR:
+	case IB_QPT_XRC_INI:
+		*send_cq = to_mcq(qp->ibqp.send_cq);
+		*recv_cq = NULL;
+		break;
+
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_UD:
+	case IB_QPT_RAW_IPV6:
+	case IB_QPT_RAW_ETHERTYPE:
+		*send_cq = to_mcq(qp->ibqp.send_cq);
+		*recv_cq = to_mcq(qp->ibqp.recv_cq);
+		break;
+
+	case IB_QPT_RAW_PACKET:
+	case IB_QPT_MAX:
+	default:
+		*send_cq = NULL;
+		*recv_cq = NULL;
+		break;
+	}
+}
+
+static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
+{
+	struct mlx5_ib_cq *send_cq, *recv_cq;
+	struct mlx5_modify_qp_mbox_in *in;
+	int err;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return;
+	if (qp->state != IB_QPS_RESET)
+		if (mlx5_core_qp_modify(&dev->mdev, to_mlx5_state(qp->state),
+					MLX5_QP_STATE_RST, in, sizeof(*in), &qp->mqp))
+			mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
+				     qp->mqp.qpn);
+
+	get_cqs(qp, &send_cq, &recv_cq);
+
+	if (qp->create_type == MLX5_QP_KERNEL) {
+		mlx5_ib_lock_cqs(send_cq, recv_cq);
+		__mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
+				   qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+		if (send_cq != recv_cq)
+			__mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+		mlx5_ib_unlock_cqs(send_cq, recv_cq);
+	}
+
+	err = mlx5_core_destroy_qp(&dev->mdev, &qp->mqp);
+	if (err)
+		mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn);
+	kfree(in);
+
+
+	if (qp->create_type == MLX5_QP_KERNEL)
+		destroy_qp_kernel(dev, qp);
+	else if (qp->create_type == MLX5_QP_USER)
+		destroy_qp_user(&get_pd(qp)->ibpd, qp);
+}
+
+static const char *ib_qp_type_str(enum ib_qp_type type)
+{
+	switch (type) {
+	case IB_QPT_SMI:
+		return "IB_QPT_SMI";
+	case IB_QPT_GSI:
+		return "IB_QPT_GSI";
+	case IB_QPT_RC:
+		return "IB_QPT_RC";
+	case IB_QPT_UC:
+		return "IB_QPT_UC";
+	case IB_QPT_UD:
+		return "IB_QPT_UD";
+	case IB_QPT_RAW_IPV6:
+		return "IB_QPT_RAW_IPV6";
+	case IB_QPT_RAW_ETHERTYPE:
+		return "IB_QPT_RAW_ETHERTYPE";
+	case IB_QPT_XRC_INI:
+		return "IB_QPT_XRC_INI";
+	case IB_QPT_XRC_TGT:
+		return "IB_QPT_XRC_TGT";
+	case IB_QPT_RAW_PACKET:
+		return "IB_QPT_RAW_PACKET";
+	case MLX5_IB_QPT_REG_UMR:
+		return "MLX5_IB_QPT_REG_UMR";
+	case IB_QPT_MAX:
+	default:
+		return "Invalid QP type";
+	}
+}
+
+struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
+				struct ib_qp_init_attr *init_attr,
+				struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev;
+	struct mlx5_ib_qp *qp;
+	u16 xrcdn = 0;
+	int err;
+
+	if (pd) {
+		dev = to_mdev(pd->device);
+	} else {
+		/* being cautious here */
+		if (init_attr->qp_type != IB_QPT_XRC_TGT &&
+		    init_attr->qp_type != MLX5_IB_QPT_REG_UMR) {
+			pr_warn("%s: no PD for transport %s\n", __func__,
+				ib_qp_type_str(init_attr->qp_type));
+			return ERR_PTR(-EINVAL);
+		}
+		dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
+	}
+
+	switch (init_attr->qp_type) {
+	case IB_QPT_XRC_TGT:
+	case IB_QPT_XRC_INI:
+		if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_XRC)) {
+			mlx5_ib_dbg(dev, "XRC not supported\n");
+			return ERR_PTR(-ENOSYS);
+		}
+		init_attr->recv_cq = NULL;
+		if (init_attr->qp_type == IB_QPT_XRC_TGT) {
+			xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
+			init_attr->send_cq = NULL;
+		}
+
+		/* fall through */
+	case IB_QPT_RC:
+	case IB_QPT_UC:
+	case IB_QPT_UD:
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
+	case MLX5_IB_QPT_REG_UMR:
+		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+		if (!qp)
+			return ERR_PTR(-ENOMEM);
+
+		err = create_qp_common(dev, pd, init_attr, udata, qp);
+		if (err) {
+			mlx5_ib_dbg(dev, "create_qp_common failed\n");
+			kfree(qp);
+			return ERR_PTR(err);
+		}
+
+		if (is_qp0(init_attr->qp_type))
+			qp->ibqp.qp_num = 0;
+		else if (is_qp1(init_attr->qp_type))
+			qp->ibqp.qp_num = 1;
+		else
+			qp->ibqp.qp_num = qp->mqp.qpn;
+
+		mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n",
+			    qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn,
+			    to_mcq(init_attr->send_cq)->mcq.cqn);
+
+		qp->xrcdn = xrcdn;
+
+		break;
+
+	case IB_QPT_RAW_IPV6:
+	case IB_QPT_RAW_ETHERTYPE:
+	case IB_QPT_RAW_PACKET:
+	case IB_QPT_MAX:
+	default:
+		mlx5_ib_dbg(dev, "unsupported qp type %d\n",
+			    init_attr->qp_type);
+		/* Don't support raw QPs */
+		return ERR_PTR(-EINVAL);
+	}
+
+	return &qp->ibqp;
+}
+
+int mlx5_ib_destroy_qp(struct ib_qp *qp)
+{
+	struct mlx5_ib_dev *dev = to_mdev(qp->device);
+	struct mlx5_ib_qp *mqp = to_mqp(qp);
+
+	destroy_qp_common(dev, mqp);
+
+	kfree(mqp);
+
+	return 0;
+}
+
+static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr,
+				   int attr_mask)
+{
+	u32 hw_access_flags = 0;
+	u8 dest_rd_atomic;
+	u32 access_flags;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		dest_rd_atomic = attr->max_dest_rd_atomic;
+	else
+		dest_rd_atomic = qp->resp_depth;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		access_flags = attr->qp_access_flags;
+	else
+		access_flags = qp->atomic_rd_en;
+
+	if (!dest_rd_atomic)
+		access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+	if (access_flags & IB_ACCESS_REMOTE_READ)
+		hw_access_flags |= MLX5_QP_BIT_RRE;
+	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+		hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX);
+	if (access_flags & IB_ACCESS_REMOTE_WRITE)
+		hw_access_flags |= MLX5_QP_BIT_RWE;
+
+	return cpu_to_be32(hw_access_flags);
+}
+
+enum {
+	MLX5_PATH_FLAG_FL	= 1 << 0,
+	MLX5_PATH_FLAG_FREE_AR	= 1 << 1,
+	MLX5_PATH_FLAG_COUNTER	= 1 << 2,
+};
+
+static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
+{
+	if (rate == IB_RATE_PORT_CURRENT) {
+		return 0;
+	} else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) {
+		return -EINVAL;
+	} else {
+		while (rate != IB_RATE_2_5_GBPS &&
+		       !(1 << (rate + MLX5_STAT_RATE_OFFSET) &
+			 dev->mdev.caps.stat_rate_support))
+			--rate;
+	}
+
+	return rate + MLX5_STAT_RATE_OFFSET;
+}
+
+static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
+			 struct mlx5_qp_path *path, u8 port, int attr_mask,
+			 u32 path_flags, const struct ib_qp_attr *attr)
+{
+	int err;
+
+	path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
+	path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0;
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		path->pkey_index = attr->pkey_index;
+
+	path->grh_mlid	= ah->src_path_bits & 0x7f;
+	path->rlid	= cpu_to_be16(ah->dlid);
+
+	if (ah->ah_flags & IB_AH_GRH) {
+		path->grh_mlid |= 1 << 7;
+		path->mgid_index = ah->grh.sgid_index;
+		path->hop_limit  = ah->grh.hop_limit;
+		path->tclass_flowlabel =
+			cpu_to_be32((ah->grh.traffic_class << 20) |
+				    (ah->grh.flow_label));
+		memcpy(path->rgid, ah->grh.dgid.raw, 16);
+	}
+
+	err = ib_rate_to_mlx5(dev, ah->static_rate);
+	if (err < 0)
+		return err;
+	path->static_rate = err;
+	path->port = port;
+
+	if (ah->ah_flags & IB_AH_GRH) {
+		if (ah->grh.sgid_index >= dev->mdev.caps.port[port - 1].gid_table_len) {
+			pr_err(KERN_ERR "sgid_index (%u) too large. max is %d\n",
+			       ah->grh.sgid_index, dev->mdev.caps.port[port - 1].gid_table_len);
+			return -EINVAL;
+		}
+
+		path->grh_mlid |= 1 << 7;
+		path->mgid_index = ah->grh.sgid_index;
+		path->hop_limit  = ah->grh.hop_limit;
+		path->tclass_flowlabel =
+			cpu_to_be32((ah->grh.traffic_class << 20) |
+				    (ah->grh.flow_label));
+		memcpy(path->rgid, ah->grh.dgid.raw, 16);
+	}
+
+	if (attr_mask & IB_QP_TIMEOUT)
+		path->ackto_lt = attr->timeout << 3;
+
+	path->sl = ah->sl & 0xf;
+
+	return 0;
+}
+
+static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = {
+	[MLX5_QP_STATE_INIT] = {
+		[MLX5_QP_STATE_INIT] = {
+			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE		|
+					  MLX5_QP_OPTPAR_RAE		|
+					  MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PKEY_INDEX	|
+					  MLX5_QP_OPTPAR_PRI_PORT,
+			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PKEY_INDEX	|
+					  MLX5_QP_OPTPAR_PRI_PORT,
+			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX	|
+					  MLX5_QP_OPTPAR_Q_KEY		|
+					  MLX5_QP_OPTPAR_PRI_PORT,
+		},
+		[MLX5_QP_STATE_RTR] = {
+			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
+					  MLX5_QP_OPTPAR_RRE            |
+					  MLX5_QP_OPTPAR_RAE            |
+					  MLX5_QP_OPTPAR_RWE            |
+					  MLX5_QP_OPTPAR_PKEY_INDEX,
+			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
+					  MLX5_QP_OPTPAR_RWE            |
+					  MLX5_QP_OPTPAR_PKEY_INDEX,
+			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX     |
+					  MLX5_QP_OPTPAR_Q_KEY,
+			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX	|
+					   MLX5_QP_OPTPAR_Q_KEY,
+		},
+	},
+	[MLX5_QP_STATE_RTR] = {
+		[MLX5_QP_STATE_RTS] = {
+			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH	|
+					  MLX5_QP_OPTPAR_RRE		|
+					  MLX5_QP_OPTPAR_RAE		|
+					  MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PM_STATE	|
+					  MLX5_QP_OPTPAR_RNR_TIMEOUT,
+			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH	|
+					  MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PM_STATE,
+			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY,
+		},
+	},
+	[MLX5_QP_STATE_RTS] = {
+		[MLX5_QP_STATE_RTS] = {
+			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE		|
+					  MLX5_QP_OPTPAR_RAE		|
+					  MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_RNR_TIMEOUT	|
+					  MLX5_QP_OPTPAR_PM_STATE,
+			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		|
+					  MLX5_QP_OPTPAR_PM_STATE,
+			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY		|
+					  MLX5_QP_OPTPAR_SRQN		|
+					  MLX5_QP_OPTPAR_CQN_RCV,
+		},
+	},
+	[MLX5_QP_STATE_SQER] = {
+		[MLX5_QP_STATE_RTS] = {
+			[MLX5_QP_ST_UD]	 = MLX5_QP_OPTPAR_Q_KEY,
+			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY,
+		},
+	},
+};
+
+static int ib_nr_to_mlx5_nr(int ib_mask)
+{
+	switch (ib_mask) {
+	case IB_QP_STATE:
+		return 0;
+	case IB_QP_CUR_STATE:
+		return 0;
+	case IB_QP_EN_SQD_ASYNC_NOTIFY:
+		return 0;
+	case IB_QP_ACCESS_FLAGS:
+		return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE |
+			MLX5_QP_OPTPAR_RAE;
+	case IB_QP_PKEY_INDEX:
+		return MLX5_QP_OPTPAR_PKEY_INDEX;
+	case IB_QP_PORT:
+		return MLX5_QP_OPTPAR_PRI_PORT;
+	case IB_QP_QKEY:
+		return MLX5_QP_OPTPAR_Q_KEY;
+	case IB_QP_AV:
+		return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH |
+			MLX5_QP_OPTPAR_PRI_PORT;
+	case IB_QP_PATH_MTU:
+		return 0;
+	case IB_QP_TIMEOUT:
+		return MLX5_QP_OPTPAR_ACK_TIMEOUT;
+	case IB_QP_RETRY_CNT:
+		return MLX5_QP_OPTPAR_RETRY_COUNT;
+	case IB_QP_RNR_RETRY:
+		return MLX5_QP_OPTPAR_RNR_RETRY;
+	case IB_QP_RQ_PSN:
+		return 0;
+	case IB_QP_MAX_QP_RD_ATOMIC:
+		return MLX5_QP_OPTPAR_SRA_MAX;
+	case IB_QP_ALT_PATH:
+		return MLX5_QP_OPTPAR_ALT_ADDR_PATH;
+	case IB_QP_MIN_RNR_TIMER:
+		return MLX5_QP_OPTPAR_RNR_TIMEOUT;
+	case IB_QP_SQ_PSN:
+		return 0;
+	case IB_QP_MAX_DEST_RD_ATOMIC:
+		return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE |
+			MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE;
+	case IB_QP_PATH_MIG_STATE:
+		return MLX5_QP_OPTPAR_PM_STATE;
+	case IB_QP_CAP:
+		return 0;
+	case IB_QP_DEST_QPN:
+		return 0;
+	}
+	return 0;
+}
+
+static int ib_mask_to_mlx5_opt(int ib_mask)
+{
+	int result = 0;
+	int i;
+
+	for (i = 0; i < 8 * sizeof(int); i++) {
+		if ((1 << i) & ib_mask)
+			result |= ib_nr_to_mlx5_nr(1 << i);
+	}
+
+	return result;
+}
+
+static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
+			       const struct ib_qp_attr *attr, int attr_mask,
+			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	struct mlx5_ib_cq *send_cq, *recv_cq;
+	struct mlx5_qp_context *context;
+	struct mlx5_modify_qp_mbox_in *in;
+	struct mlx5_ib_pd *pd;
+	enum mlx5_qp_state mlx5_cur, mlx5_new;
+	enum mlx5_qp_optpar optpar;
+	int sqd_event;
+	int mlx5_st;
+	int err;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	context = &in->ctx;
+	err = to_mlx5_st(ibqp->qp_type);
+	if (err < 0)
+		goto out;
+
+	context->flags = cpu_to_be32(err << 16);
+
+	if (!(attr_mask & IB_QP_PATH_MIG_STATE)) {
+		context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
+	} else {
+		switch (attr->path_mig_state) {
+		case IB_MIG_MIGRATED:
+			context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
+			break;
+		case IB_MIG_REARM:
+			context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11);
+			break;
+		case IB_MIG_ARMED:
+			context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11);
+			break;
+		}
+	}
+
+	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
+		context->mtu_msgmax = (IB_MTU_256 << 5) | 8;
+	} else if (ibqp->qp_type == IB_QPT_UD ||
+		   ibqp->qp_type == MLX5_IB_QPT_REG_UMR) {
+		context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
+	} else if (attr_mask & IB_QP_PATH_MTU) {
+		if (attr->path_mtu < IB_MTU_256 ||
+		    attr->path_mtu > IB_MTU_4096) {
+			mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu);
+			err = -EINVAL;
+			goto out;
+		}
+		context->mtu_msgmax = (attr->path_mtu << 5) | dev->mdev.caps.log_max_msg;
+	}
+
+	if (attr_mask & IB_QP_DEST_QPN)
+		context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num);
+
+	if (attr_mask & IB_QP_PKEY_INDEX)
+		context->pri_path.pkey_index = attr->pkey_index;
+
+	/* todo implement counter_index functionality */
+
+	if (is_sqp(ibqp->qp_type))
+		context->pri_path.port = qp->port;
+
+	if (attr_mask & IB_QP_PORT)
+		context->pri_path.port = attr->port_num;
+
+	if (attr_mask & IB_QP_AV) {
+		err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path,
+				    attr_mask & IB_QP_PORT ? attr->port_num : qp->port,
+				    attr_mask, 0, attr);
+		if (err)
+			goto out;
+	}
+
+	if (attr_mask & IB_QP_TIMEOUT)
+		context->pri_path.ackto_lt |= attr->timeout << 3;
+
+	if (attr_mask & IB_QP_ALT_PATH) {
+		err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
+				    attr->alt_port_num, attr_mask, 0, attr);
+		if (err)
+			goto out;
+	}
+
+	pd = get_pd(qp);
+	get_cqs(qp, &send_cq, &recv_cq);
+
+	context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn);
+	context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0;
+	context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0;
+	context->params1  = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28);
+
+	if (attr_mask & IB_QP_RNR_RETRY)
+		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+
+	if (attr_mask & IB_QP_RETRY_CNT)
+		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+		if (attr->max_rd_atomic)
+			context->params1 |=
+				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+	}
+
+	if (attr_mask & IB_QP_SQ_PSN)
+		context->next_send_psn = cpu_to_be32(attr->sq_psn);
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+		if (attr->max_dest_rd_atomic)
+			context->params2 |=
+				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+	}
+
+	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC))
+		context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask);
+
+	if (attr_mask & IB_QP_MIN_RNR_TIMER)
+		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+
+	if (attr_mask & IB_QP_RQ_PSN)
+		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+	if (attr_mask & IB_QP_QKEY)
+		context->qkey = cpu_to_be32(attr->qkey);
+
+	if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		context->db_rec_addr = cpu_to_be64(qp->db.dma);
+
+	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
+	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
+		sqd_event = 1;
+	else
+		sqd_event = 0;
+
+	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+		context->sq_crq_size |= cpu_to_be16(1 << 4);
+
+
+	mlx5_cur = to_mlx5_state(cur_state);
+	mlx5_new = to_mlx5_state(new_state);
+	mlx5_st = to_mlx5_st(ibqp->qp_type);
+	if (mlx5_cur < 0 || mlx5_new < 0 || mlx5_st < 0)
+		goto out;
+
+	optpar = ib_mask_to_mlx5_opt(attr_mask);
+	optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
+	in->optparam = cpu_to_be32(optpar);
+	err = mlx5_core_qp_modify(&dev->mdev, to_mlx5_state(cur_state),
+				  to_mlx5_state(new_state), in, sqd_event,
+				  &qp->mqp);
+	if (err)
+		goto out;
+
+	qp->state = new_state;
+
+	if (attr_mask & IB_QP_ACCESS_FLAGS)
+		qp->atomic_rd_en = attr->qp_access_flags;
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+		qp->resp_depth = attr->max_dest_rd_atomic;
+	if (attr_mask & IB_QP_PORT)
+		qp->port = attr->port_num;
+	if (attr_mask & IB_QP_ALT_PATH)
+		qp->alt_port = attr->alt_port_num;
+
+	/*
+	 * If we moved a kernel QP to RESET, clean up all old CQ
+	 * entries and reinitialize the QP.
+	 */
+	if (new_state == IB_QPS_RESET && !ibqp->uobject) {
+		mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
+				 ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+		if (send_cq != recv_cq)
+			mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+
+		qp->rq.head = 0;
+		qp->rq.tail = 0;
+		qp->sq.head = 0;
+		qp->sq.tail = 0;
+		qp->sq.cur_post = 0;
+		qp->sq.last_poll = 0;
+		qp->db.db[MLX5_RCV_DBR] = 0;
+		qp->db.db[MLX5_SND_DBR] = 0;
+	}
+
+out:
+	kfree(in);
+	return err;
+}
+
+int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		      int attr_mask, struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	enum ib_qp_state cur_state, new_state;
+	int err = -EINVAL;
+	int port;
+
+	mutex_lock(&qp->mutex);
+
+	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+	if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR &&
+	    !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask))
+		goto out;
+
+	if ((attr_mask & IB_QP_PORT) &&
+	    (attr->port_num == 0 || attr->port_num > dev->mdev.caps.num_ports))
+		goto out;
+
+	if (attr_mask & IB_QP_PKEY_INDEX) {
+		port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+		if (attr->pkey_index >= dev->mdev.caps.port[port - 1].pkey_table_len)
+			goto out;
+	}
+
+	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+	    attr->max_rd_atomic > dev->mdev.caps.max_ra_res_qp)
+		goto out;
+
+	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+	    attr->max_dest_rd_atomic > dev->mdev.caps.max_ra_req_qp)
+		goto out;
+
+	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+		err = 0;
+		goto out;
+	}
+
+	err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+	struct mlx5_ib_cq *cq;
+	unsigned cur;
+
+	cur = wq->head - wq->tail;
+	if (likely(cur + nreq < wq->max_post))
+		return 0;
+
+	cq = to_mcq(ib_cq);
+	spin_lock(&cq->lock);
+	cur = wq->head - wq->tail;
+	spin_unlock(&cq->lock);
+
+	return cur + nreq >= wq->max_post;
+}
+
+static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
+					  u64 remote_addr, u32 rkey)
+{
+	rseg->raddr    = cpu_to_be64(remote_addr);
+	rseg->rkey     = cpu_to_be32(rkey);
+	rseg->reserved = 0;
+}
+
+static void set_atomic_seg(struct mlx5_wqe_atomic_seg *aseg, struct ib_send_wr *wr)
+{
+	if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add);
+	} else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+		aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add_mask);
+	} else {
+		aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+		aseg->compare  = 0;
+	}
+}
+
+static void set_masked_atomic_seg(struct mlx5_wqe_masked_atomic_seg *aseg,
+				  struct ib_send_wr *wr)
+{
+	aseg->swap_add		= cpu_to_be64(wr->wr.atomic.swap);
+	aseg->swap_add_mask	= cpu_to_be64(wr->wr.atomic.swap_mask);
+	aseg->compare		= cpu_to_be64(wr->wr.atomic.compare_add);
+	aseg->compare_mask	= cpu_to_be64(wr->wr.atomic.compare_add_mask);
+}
+
+static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
+			     struct ib_send_wr *wr)
+{
+	memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av));
+	dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV);
+	dseg->av.key.qkey.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+}
+
+static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+	dseg->byte_count = cpu_to_be32(sg->length);
+	dseg->lkey       = cpu_to_be32(sg->lkey);
+	dseg->addr       = cpu_to_be64(sg->addr);
+}
+
+static __be16 get_klm_octo(int npages)
+{
+	return cpu_to_be16(ALIGN(npages, 8) / 2);
+}
+
+static __be64 frwr_mkey_mask(void)
+{
+	u64 result;
+
+	result = MLX5_MKEY_MASK_LEN		|
+		MLX5_MKEY_MASK_PAGE_SIZE	|
+		MLX5_MKEY_MASK_START_ADDR	|
+		MLX5_MKEY_MASK_EN_RINVAL	|
+		MLX5_MKEY_MASK_KEY		|
+		MLX5_MKEY_MASK_LR		|
+		MLX5_MKEY_MASK_LW		|
+		MLX5_MKEY_MASK_RR		|
+		MLX5_MKEY_MASK_RW		|
+		MLX5_MKEY_MASK_A		|
+		MLX5_MKEY_MASK_SMALL_FENCE	|
+		MLX5_MKEY_MASK_FREE;
+
+	return cpu_to_be64(result);
+}
+
+static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
+				 struct ib_send_wr *wr, int li)
+{
+	memset(umr, 0, sizeof(*umr));
+
+	if (li) {
+		umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
+		umr->flags = 1 << 7;
+		return;
+	}
+
+	umr->flags = (1 << 5); /* fail if not free */
+	umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len);
+	umr->mkey_mask = frwr_mkey_mask();
+}
+
+static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
+				struct ib_send_wr *wr)
+{
+	struct umr_wr *umrwr = (struct umr_wr *)&wr->wr.fast_reg;
+	u64 mask;
+
+	memset(umr, 0, sizeof(*umr));
+
+	if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) {
+		umr->flags = 1 << 5; /* fail if not free */
+		umr->klm_octowords = get_klm_octo(umrwr->npages);
+		mask =  MLX5_MKEY_MASK_LEN		|
+			MLX5_MKEY_MASK_PAGE_SIZE	|
+			MLX5_MKEY_MASK_START_ADDR	|
+			MLX5_MKEY_MASK_PD		|
+			MLX5_MKEY_MASK_LR		|
+			MLX5_MKEY_MASK_LW		|
+			MLX5_MKEY_MASK_RR		|
+			MLX5_MKEY_MASK_RW		|
+			MLX5_MKEY_MASK_A		|
+			MLX5_MKEY_MASK_FREE;
+		umr->mkey_mask = cpu_to_be64(mask);
+	} else {
+		umr->flags = 2 << 5; /* fail if free */
+		mask = MLX5_MKEY_MASK_FREE;
+		umr->mkey_mask = cpu_to_be64(mask);
+	}
+
+	if (!wr->num_sge)
+		umr->flags |= (1 << 7); /* inline */
+}
+
+static u8 get_umr_flags(int acc)
+{
+	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC       : 0) |
+	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |
+	       (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |
+	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX5_PERM_LOCAL_WRITE  : 0) |
+		MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;
+}
+
+static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,
+			     int li, int *writ)
+{
+	memset(seg, 0, sizeof(*seg));
+	if (li) {
+		seg->status = 1 << 6;
+		return;
+	}
+
+	seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags);
+	*writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE);
+	seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00);
+	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL);
+	seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
+	seg->len = cpu_to_be64(wr->wr.fast_reg.length);
+	seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2);
+	seg->log2_page_size = wr->wr.fast_reg.page_shift;
+}
+
+static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr)
+{
+	memset(seg, 0, sizeof(*seg));
+	if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) {
+		seg->status = 1 << 6;
+		return;
+	}
+
+	seg->flags = convert_access(wr->wr.fast_reg.access_flags);
+	seg->flags_pd = cpu_to_be32(to_mpd((struct ib_pd *)wr->wr.fast_reg.page_list)->pdn);
+	seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
+	seg->len = cpu_to_be64(wr->wr.fast_reg.length);
+	seg->log2_page_size = wr->wr.fast_reg.page_shift;
+	seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+}
+
+static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg,
+			   struct ib_send_wr *wr,
+			   struct mlx5_core_dev *mdev,
+			   struct mlx5_ib_pd *pd,
+			   int writ)
+{
+	struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
+	u64 *page_list = wr->wr.fast_reg.page_list->page_list;
+	u64 perm = MLX5_EN_RD | (writ ? MLX5_EN_WR : 0);
+	int i;
+
+	for (i = 0; i < wr->wr.fast_reg.page_list_len; i++)
+		mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm);
+	dseg->addr = cpu_to_be64(mfrpl->map);
+	dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64));
+	dseg->lkey = cpu_to_be32(pd->pa_lkey);
+}
+
+static __be32 send_ieth(struct ib_send_wr *wr)
+{
+	switch (wr->opcode) {
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		return wr->ex.imm_data;
+
+	case IB_WR_SEND_WITH_INV:
+		return cpu_to_be32(wr->ex.invalidate_rkey);
+
+	default:
+		return 0;
+	}
+}
+
+static u8 calc_sig(void *wqe, int size)
+{
+	u8 *p = wqe;
+	u8 res = 0;
+	int i;
+
+	for (i = 0; i < size; i++)
+		res ^= p[i];
+
+	return ~res;
+}
+
+static u8 wq_sig(void *wqe)
+{
+	return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4);
+}
+
+static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr,
+			    void *wqe, int *sz)
+{
+	struct mlx5_wqe_inline_seg *seg;
+	void *qend = qp->sq.qend;
+	void *addr;
+	int inl = 0;
+	int copy;
+	int len;
+	int i;
+
+	seg = wqe;
+	wqe += sizeof(*seg);
+	for (i = 0; i < wr->num_sge; i++) {
+		addr = (void *)(unsigned long)(wr->sg_list[i].addr);
+		len  = wr->sg_list[i].length;
+		inl += len;
+
+		if (unlikely(inl > qp->max_inline_data))
+			return -ENOMEM;
+
+		if (unlikely(wqe + len > qend)) {
+			copy = qend - wqe;
+			memcpy(wqe, addr, copy);
+			addr += copy;
+			len -= copy;
+			wqe = mlx5_get_send_wqe(qp, 0);
+		}
+		memcpy(wqe, addr, len);
+		wqe += len;
+	}
+
+	seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);
+
+	*sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16;
+
+	return 0;
+}
+
+static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size,
+			  struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp)
+{
+	int writ = 0;
+	int li;
+
+	li = wr->opcode == IB_WR_LOCAL_INV ? 1 : 0;
+	if (unlikely(wr->send_flags & IB_SEND_INLINE))
+		return -EINVAL;
+
+	set_frwr_umr_segment(*seg, wr, li);
+	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+	if (unlikely((*seg == qp->sq.qend)))
+		*seg = mlx5_get_send_wqe(qp, 0);
+	set_mkey_segment(*seg, wr, li, &writ);
+	*seg += sizeof(struct mlx5_mkey_seg);
+	*size += sizeof(struct mlx5_mkey_seg) / 16;
+	if (unlikely((*seg == qp->sq.qend)))
+		*seg = mlx5_get_send_wqe(qp, 0);
+	if (!li) {
+		set_frwr_pages(*seg, wr, mdev, pd, writ);
+		*seg += sizeof(struct mlx5_wqe_data_seg);
+		*size += (sizeof(struct mlx5_wqe_data_seg) / 16);
+	}
+	return 0;
+}
+
+static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16)
+{
+	__be32 *p = NULL;
+	int tidx = idx;
+	int i, j;
+
+	pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx));
+	for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
+		if ((i & 0xf) == 0) {
+			void *buf = mlx5_get_send_wqe(qp, tidx);
+			tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1);
+			p = buf;
+			j = 0;
+		}
+		pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]),
+			 be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]),
+			 be32_to_cpu(p[j + 3]));
+	}
+}
+
+static void mlx5_bf_copy(u64 __iomem *dst, u64 *src,
+			 unsigned bytecnt, struct mlx5_ib_qp *qp)
+{
+	while (bytecnt > 0) {
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		__iowrite64_copy(dst++, src++, 8);
+		bytecnt -= 64;
+		if (unlikely(src == qp->sq.qend))
+			src = mlx5_get_send_wqe(qp, 0);
+	}
+}
+
+static u8 get_fence(u8 fence, struct ib_send_wr *wr)
+{
+	if (unlikely(wr->opcode == IB_WR_LOCAL_INV &&
+		     wr->send_flags & IB_SEND_FENCE))
+		return MLX5_FENCE_MODE_STRONG_ORDERING;
+
+	if (unlikely(fence)) {
+		if (wr->send_flags & IB_SEND_FENCE)
+			return MLX5_FENCE_MODE_SMALL_AND_FENCE;
+		else
+			return fence;
+
+	} else {
+		return 0;
+	}
+}
+
+int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+		      struct ib_send_wr **bad_wr)
+{
+	struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* compiler warning */
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_core_dev *mdev = &dev->mdev;
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	struct mlx5_wqe_data_seg *dpseg;
+	struct mlx5_wqe_xrc_seg *xrc;
+	struct mlx5_bf *bf = qp->bf;
+	int uninitialized_var(size);
+	void *qend = qp->sq.qend;
+	unsigned long flags;
+	u32 mlx5_opcode;
+	unsigned idx;
+	int err = 0;
+	int inl = 0;
+	int num_sge;
+	void *seg;
+	int nreq;
+	int i;
+	u8 next_fence = 0;
+	u8 opmod = 0;
+	u8 fence;
+
+	spin_lock_irqsave(&qp->sq.lock, flags);
+
+	for (nreq = 0; wr; nreq++, wr = wr->next) {
+		if (unlikely(wr->opcode >= sizeof(mlx5_ib_opcode) / sizeof(mlx5_ib_opcode[0]))) {
+			mlx5_ib_warn(dev, "\n");
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) {
+			mlx5_ib_warn(dev, "\n");
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		fence = qp->fm_cache;
+		num_sge = wr->num_sge;
+		if (unlikely(num_sge > qp->sq.max_gs)) {
+			mlx5_ib_warn(dev, "\n");
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
+		seg = mlx5_get_send_wqe(qp, idx);
+		ctrl = seg;
+		*(uint32_t *)(seg + 8) = 0;
+		ctrl->imm = send_ieth(wr);
+		ctrl->fm_ce_se = qp->sq_signal_bits |
+			(wr->send_flags & IB_SEND_SIGNALED ?
+			 MLX5_WQE_CTRL_CQ_UPDATE : 0) |
+			(wr->send_flags & IB_SEND_SOLICITED ?
+			 MLX5_WQE_CTRL_SOLICITED : 0);
+
+		seg += sizeof(*ctrl);
+		size = sizeof(*ctrl) / 16;
+
+		switch (ibqp->qp_type) {
+		case IB_QPT_XRC_INI:
+			xrc = seg;
+			xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num);
+			seg += sizeof(*xrc);
+			size += sizeof(*xrc) / 16;
+			/* fall through */
+		case IB_QPT_RC:
+			switch (wr->opcode) {
+			case IB_WR_RDMA_READ:
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(seg, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				seg  += sizeof(struct mlx5_wqe_raddr_seg);
+				size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
+				break;
+
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				set_raddr_seg(seg, wr->wr.atomic.remote_addr,
+					      wr->wr.atomic.rkey);
+				seg  += sizeof(struct mlx5_wqe_raddr_seg);
+
+				set_atomic_seg(seg, wr);
+				seg  += sizeof(struct mlx5_wqe_atomic_seg);
+
+				size += (sizeof(struct mlx5_wqe_raddr_seg) +
+					 sizeof(struct mlx5_wqe_atomic_seg)) / 16;
+				break;
+
+			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+				set_raddr_seg(seg, wr->wr.atomic.remote_addr,
+					      wr->wr.atomic.rkey);
+				seg  += sizeof(struct mlx5_wqe_raddr_seg);
+
+				set_masked_atomic_seg(seg, wr);
+				seg  += sizeof(struct mlx5_wqe_masked_atomic_seg);
+
+				size += (sizeof(struct mlx5_wqe_raddr_seg) +
+					 sizeof(struct mlx5_wqe_masked_atomic_seg)) / 16;
+				break;
+
+			case IB_WR_LOCAL_INV:
+				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+				qp->sq.wr_data[idx] = IB_WR_LOCAL_INV;
+				ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey);
+				err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+				num_sge = 0;
+				break;
+
+			case IB_WR_FAST_REG_MR:
+				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+				qp->sq.wr_data[idx] = IB_WR_FAST_REG_MR;
+				ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey);
+				err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp);
+				if (err) {
+					mlx5_ib_warn(dev, "\n");
+					*bad_wr = wr;
+					goto out;
+				}
+				num_sge = 0;
+				break;
+
+			default:
+				break;
+			}
+			break;
+
+		case IB_QPT_UC:
+			switch (wr->opcode) {
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				set_raddr_seg(seg, wr->wr.rdma.remote_addr,
+					      wr->wr.rdma.rkey);
+				seg  += sizeof(struct mlx5_wqe_raddr_seg);
+				size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
+				break;
+
+			default:
+				break;
+			}
+			break;
+
+		case IB_QPT_UD:
+		case IB_QPT_SMI:
+		case IB_QPT_GSI:
+			set_datagram_seg(seg, wr);
+			seg  += sizeof(struct mlx5_wqe_datagram_seg);
+			size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
+			if (unlikely((seg == qend)))
+				seg = mlx5_get_send_wqe(qp, 0);
+			break;
+
+		case MLX5_IB_QPT_REG_UMR:
+			if (wr->opcode != MLX5_IB_WR_UMR) {
+				err = -EINVAL;
+				mlx5_ib_warn(dev, "bad opcode\n");
+				goto out;
+			}
+			qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
+			ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey);
+			set_reg_umr_segment(seg, wr);
+			seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+			size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+			if (unlikely((seg == qend)))
+				seg = mlx5_get_send_wqe(qp, 0);
+			set_reg_mkey_segment(seg, wr);
+			seg += sizeof(struct mlx5_mkey_seg);
+			size += sizeof(struct mlx5_mkey_seg) / 16;
+			if (unlikely((seg == qend)))
+				seg = mlx5_get_send_wqe(qp, 0);
+			break;
+
+		default:
+			break;
+		}
+
+		if (wr->send_flags & IB_SEND_INLINE && num_sge) {
+			int uninitialized_var(sz);
+
+			err = set_data_inl_seg(qp, wr, seg, &sz);
+			if (unlikely(err)) {
+				mlx5_ib_warn(dev, "\n");
+				*bad_wr = wr;
+				goto out;
+			}
+			inl = 1;
+			size += sz;
+		} else {
+			dpseg = seg;
+			for (i = 0; i < num_sge; i++) {
+				if (unlikely(dpseg == qend)) {
+					seg = mlx5_get_send_wqe(qp, 0);
+					dpseg = seg;
+				}
+				if (likely(wr->sg_list[i].length)) {
+					set_data_ptr_seg(dpseg, wr->sg_list + i);
+					size += sizeof(struct mlx5_wqe_data_seg) / 16;
+					dpseg++;
+				}
+			}
+		}
+
+		mlx5_opcode = mlx5_ib_opcode[wr->opcode];
+		ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8)	|
+						     mlx5_opcode			|
+						     ((u32)opmod << 24));
+		ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8));
+		ctrl->fm_ce_se |= get_fence(fence, wr);
+		qp->fm_cache = next_fence;
+		if (unlikely(qp->wq_sig))
+			ctrl->signature = wq_sig(ctrl);
+
+		qp->sq.wrid[idx] = wr->wr_id;
+		qp->sq.w_list[idx].opcode = mlx5_opcode;
+		qp->sq.wqe_head[idx] = qp->sq.head + nreq;
+		qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
+		qp->sq.w_list[idx].next = qp->sq.cur_post;
+
+		if (0)
+			dump_wqe(qp, idx, size);
+	}
+
+out:
+	if (likely(nreq)) {
+		qp->sq.head += nreq;
+
+		/* Make sure that descriptors are written before
+		 * updating doorbell record and ringing the doorbell
+		 */
+		wmb();
+
+		qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);
+
+		if (bf->need_lock)
+			spin_lock(&bf->lock);
+
+		/* TBD enable WC */
+		if (0 && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) {
+			mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp);
+			/* wc_wmb(); */
+		} else {
+			mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset,
+				     MLX5_GET_DOORBELL_LOCK(&bf->lock32));
+			/* Make sure doorbells don't leak out of SQ spinlock
+			 * and reach the HCA out of order.
+			 */
+			mmiowb();
+		}
+		bf->offset ^= bf->buf_size;
+		if (bf->need_lock)
+			spin_unlock(&bf->lock);
+	}
+
+	spin_unlock_irqrestore(&qp->sq.lock, flags);
+
+	return err;
+}
+
+static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size)
+{
+	sig->signature = calc_sig(sig, size);
+}
+
+int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+		      struct ib_recv_wr **bad_wr)
+{
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	struct mlx5_wqe_data_seg *scat;
+	struct mlx5_rwqe_sig *sig;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int ind;
+	int i;
+
+	spin_lock_irqsave(&qp->rq.lock, flags);
+
+	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+	for (nreq = 0; wr; nreq++, wr = wr->next) {
+		if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			goto out;
+		}
+
+		scat = get_recv_wqe(qp, ind);
+		if (qp->wq_sig)
+			scat++;
+
+		for (i = 0; i < wr->num_sge; i++)
+			set_data_ptr_seg(scat + i, wr->sg_list + i);
+
+		if (i < qp->rq.max_gs) {
+			scat[i].byte_count = 0;
+			scat[i].lkey       = cpu_to_be32(MLX5_INVALID_LKEY);
+			scat[i].addr       = 0;
+		}
+
+		if (qp->wq_sig) {
+			sig = (struct mlx5_rwqe_sig *)scat;
+			set_sig_seg(sig, (qp->rq.max_gs + 1) << 2);
+		}
+
+		qp->rq.wrid[ind] = wr->wr_id;
+
+		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+	}
+
+out:
+	if (likely(nreq)) {
+		qp->rq.head += nreq;
+
+		/* Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+	}
+
+	spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+	return err;
+}
+
+static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state)
+{
+	switch (mlx5_state) {
+	case MLX5_QP_STATE_RST:      return IB_QPS_RESET;
+	case MLX5_QP_STATE_INIT:     return IB_QPS_INIT;
+	case MLX5_QP_STATE_RTR:      return IB_QPS_RTR;
+	case MLX5_QP_STATE_RTS:      return IB_QPS_RTS;
+	case MLX5_QP_STATE_SQ_DRAINING:
+	case MLX5_QP_STATE_SQD:      return IB_QPS_SQD;
+	case MLX5_QP_STATE_SQER:     return IB_QPS_SQE;
+	case MLX5_QP_STATE_ERR:      return IB_QPS_ERR;
+	default:		     return -1;
+	}
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state)
+{
+	switch (mlx5_mig_state) {
+	case MLX5_QP_PM_ARMED:		return IB_MIG_ARMED;
+	case MLX5_QP_PM_REARM:		return IB_MIG_REARM;
+	case MLX5_QP_PM_MIGRATED:	return IB_MIG_MIGRATED;
+	default: return -1;
+	}
+}
+
+static int to_ib_qp_access_flags(int mlx5_flags)
+{
+	int ib_flags = 0;
+
+	if (mlx5_flags & MLX5_QP_BIT_RRE)
+		ib_flags |= IB_ACCESS_REMOTE_READ;
+	if (mlx5_flags & MLX5_QP_BIT_RWE)
+		ib_flags |= IB_ACCESS_REMOTE_WRITE;
+	if (mlx5_flags & MLX5_QP_BIT_RAE)
+		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+	return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr,
+				struct mlx5_qp_path *path)
+{
+	struct mlx5_core_dev *dev = &ibdev->mdev;
+
+	memset(ib_ah_attr, 0, sizeof(*ib_ah_attr));
+	ib_ah_attr->port_num	  = path->port;
+
+	if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
+		return;
+
+	ib_ah_attr->sl = path->sl & 0xf;
+
+	ib_ah_attr->dlid	  = be16_to_cpu(path->rlid);
+	ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f;
+	ib_ah_attr->static_rate   = path->static_rate ? path->static_rate - 5 : 0;
+	ib_ah_attr->ah_flags      = (path->grh_mlid & (1 << 7)) ? IB_AH_GRH : 0;
+	if (ib_ah_attr->ah_flags) {
+		ib_ah_attr->grh.sgid_index = path->mgid_index;
+		ib_ah_attr->grh.hop_limit  = path->hop_limit;
+		ib_ah_attr->grh.traffic_class =
+			(be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
+		ib_ah_attr->grh.flow_label =
+			be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
+		memcpy(ib_ah_attr->grh.dgid.raw,
+		       path->rgid, sizeof(ib_ah_attr->grh.dgid.raw));
+	}
+}
+
+int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+		     struct ib_qp_init_attr *qp_init_attr)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+	struct mlx5_ib_qp *qp = to_mqp(ibqp);
+	struct mlx5_query_qp_mbox_out *outb;
+	struct mlx5_qp_context *context;
+	int mlx5_state;
+	int err = 0;
+
+	mutex_lock(&qp->mutex);
+	outb = kzalloc(sizeof(*outb), GFP_KERNEL);
+	if (!outb) {
+		err = -ENOMEM;
+		goto out;
+	}
+	context = &outb->ctx;
+	err = mlx5_core_qp_query(&dev->mdev, &qp->mqp, outb, sizeof(*outb));
+	if (err)
+		goto out_free;
+
+	mlx5_state = be32_to_cpu(context->flags) >> 28;
+
+	qp->state		     = to_ib_qp_state(mlx5_state);
+	qp_attr->qp_state	     = qp->state;
+	qp_attr->path_mtu	     = context->mtu_msgmax >> 5;
+	qp_attr->path_mig_state	     =
+		to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3);
+	qp_attr->qkey		     = be32_to_cpu(context->qkey);
+	qp_attr->rq_psn		     = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff;
+	qp_attr->sq_psn		     = be32_to_cpu(context->next_send_psn) & 0xffffff;
+	qp_attr->dest_qp_num	     = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff;
+	qp_attr->qp_access_flags     =
+		to_ib_qp_access_flags(be32_to_cpu(context->params2));
+
+	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
+		to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path);
+		to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path);
+		qp_attr->alt_pkey_index = context->alt_path.pkey_index & 0x7f;
+		qp_attr->alt_port_num	= qp_attr->alt_ah_attr.port_num;
+	}
+
+	qp_attr->pkey_index = context->pri_path.pkey_index & 0x7f;
+	qp_attr->port_num = context->pri_path.port;
+
+	/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+	qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING;
+
+	qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7);
+
+	qp_attr->max_dest_rd_atomic =
+		1 << ((be32_to_cpu(context->params2) >> 21) & 0x7);
+	qp_attr->min_rnr_timer	    =
+		(be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f;
+	qp_attr->timeout	    = context->pri_path.ackto_lt >> 3;
+	qp_attr->retry_cnt	    = (be32_to_cpu(context->params1) >> 16) & 0x7;
+	qp_attr->rnr_retry	    = (be32_to_cpu(context->params1) >> 13) & 0x7;
+	qp_attr->alt_timeout	    = context->alt_path.ackto_lt >> 3;
+	qp_attr->cur_qp_state	     = qp_attr->qp_state;
+	qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
+	qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
+
+	if (!ibqp->uobject) {
+		qp_attr->cap.max_send_wr  = qp->sq.wqe_cnt;
+		qp_attr->cap.max_send_sge = qp->sq.max_gs;
+	} else {
+		qp_attr->cap.max_send_wr  = 0;
+		qp_attr->cap.max_send_sge = 0;
+	}
+
+	/* We don't support inline sends for kernel QPs (yet), and we
+	 * don't know what userspace's value should be.
+	 */
+	qp_attr->cap.max_inline_data = 0;
+
+	qp_init_attr->cap	     = qp_attr->cap;
+
+	qp_init_attr->create_flags = 0;
+	if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+	qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ?
+		IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+
+out_free:
+	kfree(outb);
+
+out:
+	mutex_unlock(&qp->mutex);
+	return err;
+}
+
+struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
+					  struct ib_ucontext *context,
+					  struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibdev);
+	struct mlx5_ib_xrcd *xrcd;
+	int err;
+
+	if (!(dev->mdev.caps.flags & MLX5_DEV_CAP_FLAG_XRC))
+		return ERR_PTR(-ENOSYS);
+
+	xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL);
+	if (!xrcd)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx5_core_xrcd_alloc(&dev->mdev, &xrcd->xrcdn);
+	if (err) {
+		kfree(xrcd);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	return &xrcd->ibxrcd;
+}
+
+int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+	struct mlx5_ib_dev *dev = to_mdev(xrcd->device);
+	u32 xrcdn = to_mxrcd(xrcd)->xrcdn;
+	int err;
+
+	err = mlx5_core_xrcd_dealloc(&dev->mdev, xrcdn);
+	if (err) {
+		mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn);
+		return err;
+	}
+
+	kfree(xrcd);
+
+	return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
new file mode 100644
index 000000000000..84d297afd6a9
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/srq.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+
+#include "mlx5_ib.h"
+#include "user.h"
+
+/* not supported currently */
+static int srq_signature;
+
+static void *get_wqe(struct mlx5_ib_srq *srq, int n)
+{
+	return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
+}
+
+static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type)
+{
+	struct ib_event event;
+	struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;
+
+	if (ibsrq->event_handler) {
+		event.device      = ibsrq->device;
+		event.element.srq = ibsrq;
+		switch (type) {
+		case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+			event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+			break;
+		case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+			event.event = IB_EVENT_SRQ_ERR;
+			break;
+		default:
+			pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n",
+				type, srq->srqn);
+			return;
+		}
+
+		ibsrq->event_handler(&event, ibsrq->srq_context);
+	}
+}
+
+static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
+			   struct mlx5_create_srq_mbox_in **in,
+			   struct ib_udata *udata, int buf_size, int *inlen)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_create_srq ucmd;
+	int err;
+	int npages;
+	int page_shift;
+	int ncont;
+	u32 offset;
+
+	if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
+		mlx5_ib_dbg(dev, "failed copy udata\n");
+		return -EFAULT;
+	}
+	srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
+
+	srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size,
+				0, 0);
+	if (IS_ERR(srq->umem)) {
+		mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
+		err = PTR_ERR(srq->umem);
+		return err;
+	}
+
+	mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, &npages,
+			   &page_shift, &ncont, NULL);
+	err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift,
+				     &offset);
+	if (err) {
+		mlx5_ib_warn(dev, "bad offset\n");
+		goto err_umem;
+	}
+
+	*inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
+	*in = mlx5_vzalloc(*inlen);
+	if (!(*in)) {
+		err = -ENOMEM;
+		goto err_umem;
+	}
+
+	mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0);
+
+	err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context),
+				  ucmd.db_addr, &srq->db);
+	if (err) {
+		mlx5_ib_dbg(dev, "map doorbell failed\n");
+		goto err_in;
+	}
+
+	(*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT;
+	(*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);
+
+	return 0;
+
+err_in:
+	mlx5_vfree(*in);
+
+err_umem:
+	ib_umem_release(srq->umem);
+
+	return err;
+}
+
+static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
+			     struct mlx5_create_srq_mbox_in **in, int buf_size,
+			     int *inlen)
+{
+	int err;
+	int i;
+	struct mlx5_wqe_srq_next_seg *next;
+	int page_shift;
+	int npages;
+
+	err = mlx5_db_alloc(&dev->mdev, &srq->db);
+	if (err) {
+		mlx5_ib_warn(dev, "alloc dbell rec failed\n");
+		return err;
+	}
+
+	*srq->db.db = 0;
+
+	if (mlx5_buf_alloc(&dev->mdev, buf_size, PAGE_SIZE * 2, &srq->buf)) {
+		mlx5_ib_dbg(dev, "buf alloc failed\n");
+		err = -ENOMEM;
+		goto err_db;
+	}
+	page_shift = srq->buf.page_shift;
+
+	srq->head    = 0;
+	srq->tail    = srq->msrq.max - 1;
+	srq->wqe_ctr = 0;
+
+	for (i = 0; i < srq->msrq.max; i++) {
+		next = get_wqe(srq, i);
+		next->next_wqe_index =
+			cpu_to_be16((i + 1) & (srq->msrq.max - 1));
+	}
+
+	npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT));
+	mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n",
+		    buf_size, page_shift, srq->buf.npages, npages);
+	*inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages;
+	*in = mlx5_vzalloc(*inlen);
+	if (!*in) {
+		err = -ENOMEM;
+		goto err_buf;
+	}
+	mlx5_fill_page_array(&srq->buf, (*in)->pas);
+
+	srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL);
+	if (!srq->wrid) {
+		mlx5_ib_dbg(dev, "kmalloc failed %lu\n",
+			    (unsigned long)(srq->msrq.max * sizeof(u64)));
+		err = -ENOMEM;
+		goto err_in;
+	}
+	srq->wq_sig = !!srq_signature;
+
+	(*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT;
+
+	return 0;
+
+err_in:
+	mlx5_vfree(*in);
+
+err_buf:
+	mlx5_buf_free(&dev->mdev, &srq->buf);
+
+err_db:
+	mlx5_db_free(&dev->mdev, &srq->db);
+	return err;
+}
+
+static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq)
+{
+	mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+	ib_umem_release(srq->umem);
+}
+
+
+static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq)
+{
+	kfree(srq->wrid);
+	mlx5_buf_free(&dev->mdev, &srq->buf);
+	mlx5_db_free(&dev->mdev, &srq->db);
+}
+
+struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
+				  struct ib_srq_init_attr *init_attr,
+				  struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(pd->device);
+	struct mlx5_ib_srq *srq;
+	int desc_size;
+	int buf_size;
+	int err;
+	struct mlx5_create_srq_mbox_in *uninitialized_var(in);
+	int uninitialized_var(inlen);
+	int is_xrc;
+	u32 flgs, xrcdn;
+
+	/* Sanity check SRQ size before proceeding */
+	if (init_attr->attr.max_wr >= dev->mdev.caps.max_srq_wqes) {
+		mlx5_ib_dbg(dev, "max_wr %d, cap %d\n",
+			    init_attr->attr.max_wr,
+			    dev->mdev.caps.max_srq_wqes);
+		return ERR_PTR(-EINVAL);
+	}
+
+	srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+	if (!srq)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&srq->mutex);
+	spin_lock_init(&srq->lock);
+	srq->msrq.max    = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+	srq->msrq.max_gs = init_attr->attr.max_sge;
+
+	desc_size = sizeof(struct mlx5_wqe_srq_next_seg) +
+		    srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg);
+	desc_size = roundup_pow_of_two(desc_size);
+	desc_size = max_t(int, 32, desc_size);
+	srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) /
+		sizeof(struct mlx5_wqe_data_seg);
+	srq->msrq.wqe_shift = ilog2(desc_size);
+	buf_size = srq->msrq.max * desc_size;
+	mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n",
+		    desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs,
+		    srq->msrq.max_avail_gather);
+
+	if (pd->uobject)
+		err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen);
+	else
+		err = create_srq_kernel(dev, srq, &in, buf_size, &inlen);
+
+	if (err) {
+		mlx5_ib_warn(dev, "create srq %s failed, err %d\n",
+			     pd->uobject ? "user" : "kernel", err);
+		goto err_srq;
+	}
+
+	is_xrc = (init_attr->srq_type == IB_SRQT_XRC);
+	in->ctx.state_log_sz = ilog2(srq->msrq.max);
+	flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24;
+	xrcdn = 0;
+	if (is_xrc) {
+		xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
+		in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn);
+	} else if (init_attr->srq_type == IB_SRQT_BASIC) {
+		xrcdn = to_mxrcd(dev->devr.x0)->xrcdn;
+		in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn);
+	}
+
+	in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF));
+
+	in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn);
+	in->ctx.db_record = cpu_to_be64(srq->db.dma);
+	err = mlx5_core_create_srq(&dev->mdev, &srq->msrq, in, inlen);
+	mlx5_vfree(in);
+	if (err) {
+		mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err);
+		goto err_srq;
+	}
+
+	mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn);
+
+	srq->msrq.event = mlx5_ib_srq_event;
+	srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;
+
+	if (pd->uobject)
+		if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) {
+			mlx5_ib_dbg(dev, "copy to user failed\n");
+			err = -EFAULT;
+			goto err_core;
+		}
+
+	init_attr->attr.max_wr = srq->msrq.max - 1;
+
+	return &srq->ibsrq;
+
+err_core:
+	mlx5_core_destroy_srq(&dev->mdev, &srq->msrq);
+	if (pd->uobject)
+		destroy_srq_user(pd, srq);
+	else
+		destroy_srq_kernel(dev, srq);
+
+err_srq:
+	kfree(srq);
+
+	return ERR_PTR(err);
+}
+
+int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+		       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
+	struct mlx5_ib_srq *srq = to_msrq(ibsrq);
+	int ret;
+
+	/* We don't support resizing SRQs yet */
+	if (attr_mask & IB_SRQ_MAX_WR)
+		return -EINVAL;
+
+	if (attr_mask & IB_SRQ_LIMIT) {
+		if (attr->srq_limit >= srq->msrq.max)
+			return -EINVAL;
+
+		mutex_lock(&srq->mutex);
+		ret = mlx5_core_arm_srq(&dev->mdev, &srq->msrq, attr->srq_limit, 1);
+		mutex_unlock(&srq->mutex);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+	struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
+	struct mlx5_ib_srq *srq = to_msrq(ibsrq);
+	int ret;
+	struct mlx5_query_srq_mbox_out *out;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	ret = mlx5_core_query_srq(&dev->mdev, &srq->msrq, out);
+	if (ret)
+		goto out_box;
+
+	srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm);
+	srq_attr->max_wr    = srq->msrq.max - 1;
+	srq_attr->max_sge   = srq->msrq.max_gs;
+
+out_box:
+	kfree(out);
+	return ret;
+}
+
+int mlx5_ib_destroy_srq(struct ib_srq *srq)
+{
+	struct mlx5_ib_dev *dev = to_mdev(srq->device);
+	struct mlx5_ib_srq *msrq = to_msrq(srq);
+
+	mlx5_core_destroy_srq(&dev->mdev, &msrq->msrq);
+
+	if (srq->uobject) {
+		mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+		ib_umem_release(msrq->umem);
+	} else {
+		kfree(msrq->wrid);
+		mlx5_buf_free(&dev->mdev, &msrq->buf);
+		mlx5_db_free(&dev->mdev, &msrq->db);
+	}
+
+	kfree(srq);
+	return 0;
+}
+
+void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index)
+{
+	struct mlx5_wqe_srq_next_seg *next;
+
+	/* always called with interrupts disabled. */
+	spin_lock(&srq->lock);
+
+	next = get_wqe(srq, srq->tail);
+	next->next_wqe_index = cpu_to_be16(wqe_index);
+	srq->tail = wqe_index;
+
+	spin_unlock(&srq->lock);
+}
+
+int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+			  struct ib_recv_wr **bad_wr)
+{
+	struct mlx5_ib_srq *srq = to_msrq(ibsrq);
+	struct mlx5_wqe_srq_next_seg *next;
+	struct mlx5_wqe_data_seg *scat;
+	unsigned long flags;
+	int err = 0;
+	int nreq;
+	int i;
+
+	spin_lock_irqsave(&srq->lock, flags);
+
+	for (nreq = 0; wr; nreq++, wr = wr->next) {
+		if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
+			err = -EINVAL;
+			*bad_wr = wr;
+			break;
+		}
+
+		if (unlikely(srq->head == srq->tail)) {
+			err = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
+
+		srq->wrid[srq->head] = wr->wr_id;
+
+		next      = get_wqe(srq, srq->head);
+		srq->head = be16_to_cpu(next->next_wqe_index);
+		scat      = (struct mlx5_wqe_data_seg *)(next + 1);
+
+		for (i = 0; i < wr->num_sge; i++) {
+			scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+			scat[i].lkey       = cpu_to_be32(wr->sg_list[i].lkey);
+			scat[i].addr       = cpu_to_be64(wr->sg_list[i].addr);
+		}
+
+		if (i < srq->msrq.max_avail_gather) {
+			scat[i].byte_count = 0;
+			scat[i].lkey       = cpu_to_be32(MLX5_INVALID_LKEY);
+			scat[i].addr       = 0;
+		}
+	}
+
+	if (likely(nreq)) {
+		srq->wqe_ctr += nreq;
+
+		/* Make sure that descriptors are written before
+		 * doorbell record.
+		 */
+		wmb();
+
+		*srq->db.db = cpu_to_be32(srq->wqe_ctr);
+	}
+
+	spin_unlock_irqrestore(&srq->lock, flags);
+
+	return err;
+}
diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h
new file mode 100644
index 000000000000..a886de3e593c
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/user.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_IB_USER_H
+#define MLX5_IB_USER_H
+
+#include <linux/types.h>
+
+enum {
+	MLX5_QP_FLAG_SIGNATURE		= 1 << 0,
+	MLX5_QP_FLAG_SCATTER_CQE	= 1 << 1,
+};
+
+enum {
+	MLX5_SRQ_FLAG_SIGNATURE		= 1 << 0,
+};
+
+
+/* Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MLX5_IB_UVERBS_ABI_VERSION	1
+
+/* Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mlx5_ib_alloc_ucontext_req {
+	__u32	total_num_uuars;
+	__u32	num_low_latency_uuars;
+};
+
+struct mlx5_ib_alloc_ucontext_resp {
+	__u32	qp_tab_size;
+	__u32	bf_reg_size;
+	__u32	tot_uuars;
+	__u32	cache_line_size;
+	__u16	max_sq_desc_sz;
+	__u16	max_rq_desc_sz;
+	__u32	max_send_wqebb;
+	__u32	max_recv_wr;
+	__u32	max_srq_recv_wr;
+	__u16	num_ports;
+	__u16	reserved;
+};
+
+struct mlx5_ib_alloc_pd_resp {
+	__u32	pdn;
+};
+
+struct mlx5_ib_create_cq {
+	__u64	buf_addr;
+	__u64	db_addr;
+	__u32	cqe_size;
+};
+
+struct mlx5_ib_create_cq_resp {
+	__u32	cqn;
+	__u32	reserved;
+};
+
+struct mlx5_ib_resize_cq {
+	__u64	buf_addr;
+};
+
+struct mlx5_ib_create_srq {
+	__u64	buf_addr;
+	__u64	db_addr;
+	__u32	flags;
+};
+
+struct mlx5_ib_create_srq_resp {
+	__u32	srqn;
+	__u32	reserved;
+};
+
+struct mlx5_ib_create_qp {
+	__u64	buf_addr;
+	__u64	db_addr;
+	__u32	sq_wqe_count;
+	__u32	rq_wqe_count;
+	__u32	rq_wqe_shift;
+	__u32	flags;
+};
+
+struct mlx5_ib_create_qp_resp {
+	__u32	uuar_index;
+};
+#endif /* MLX5_IB_USER_H */
diff --git a/drivers/net/ethernet/mellanox/Kconfig b/drivers/net/ethernet/mellanox/Kconfig
index bcdbc14aeff0..8cf7563a8d92 100644
--- a/drivers/net/ethernet/mellanox/Kconfig
+++ b/drivers/net/ethernet/mellanox/Kconfig
@@ -19,5 +19,6 @@ config NET_VENDOR_MELLANOX
 if NET_VENDOR_MELLANOX
 
 source "drivers/net/ethernet/mellanox/mlx4/Kconfig"
+source "drivers/net/ethernet/mellanox/mlx5/core/Kconfig"
 
 endif # NET_VENDOR_MELLANOX
diff --git a/drivers/net/ethernet/mellanox/Makefile b/drivers/net/ethernet/mellanox/Makefile
index 37afb9683372..38fe32ef5e5f 100644
--- a/drivers/net/ethernet/mellanox/Makefile
+++ b/drivers/net/ethernet/mellanox/Makefile
@@ -3,3 +3,4 @@
 #
 
 obj-$(CONFIG_MLX4_CORE) += mlx4/
+obj-$(CONFIG_MLX5_CORE) += mlx5/core/
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Kconfig b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
new file mode 100644
index 000000000000..21962828925a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Kconfig
@@ -0,0 +1,18 @@
+#
+# Mellanox driver configuration
+#
+
+config MLX5_CORE
+	tristate
+	depends on PCI && X86
+	default n
+
+config MLX5_DEBUG
+	bool "Verbose debugging output" if (MLX5_CORE && EXPERT)
+	depends on MLX5_CORE
+	default y
+	---help---
+	  This option causes debugging code to be compiled into the
+	  mlx5_core driver.  The output can be turned on via the
+	  debug_mask module parameter (which can also be set after
+	  the driver is loaded through sysfs).
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
new file mode 100644
index 000000000000..105780bb980b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_MLX5_CORE)		+= mlx5_core.o
+
+mlx5_core-y :=	main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \
+		health.o mcg.o cq.o srq.o alloc.o qp.o port.o mr.o pd.o   \
+		mad.o
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/alloc.c b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
new file mode 100644
index 000000000000..b215742b842f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/alloc.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+#include <linux/bitmap.h>
+#include <linux/dma-mapping.h>
+#include <linux/vmalloc.h>
+#include <linux/mlx5/driver.h>
+
+#include "mlx5_core.h"
+
+/* Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0.  If the
+ * requested size is > max_direct, we split the allocation into
+ * multiple pages, so we don't require too much contiguous memory.
+ */
+
+int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, int max_direct,
+		   struct mlx5_buf *buf)
+{
+	dma_addr_t t;
+
+	buf->size = size;
+	if (size <= max_direct) {
+		buf->nbufs        = 1;
+		buf->npages       = 1;
+		buf->page_shift   = get_order(size) + PAGE_SHIFT;
+		buf->direct.buf   = dma_zalloc_coherent(&dev->pdev->dev,
+							size, &t, GFP_KERNEL);
+		if (!buf->direct.buf)
+			return -ENOMEM;
+
+		buf->direct.map = t;
+
+		while (t & ((1 << buf->page_shift) - 1)) {
+			--buf->page_shift;
+			buf->npages *= 2;
+		}
+	} else {
+		int i;
+
+		buf->direct.buf  = NULL;
+		buf->nbufs       = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+		buf->npages      = buf->nbufs;
+		buf->page_shift  = PAGE_SHIFT;
+		buf->page_list   = kcalloc(buf->nbufs, sizeof(*buf->page_list),
+					   GFP_KERNEL);
+		if (!buf->page_list)
+			return -ENOMEM;
+
+		for (i = 0; i < buf->nbufs; i++) {
+			buf->page_list[i].buf =
+				dma_zalloc_coherent(&dev->pdev->dev, PAGE_SIZE,
+						    &t, GFP_KERNEL);
+			if (!buf->page_list[i].buf)
+				goto err_free;
+
+			buf->page_list[i].map = t;
+		}
+
+		if (BITS_PER_LONG == 64) {
+			struct page **pages;
+			pages = kmalloc(sizeof(*pages) * buf->nbufs, GFP_KERNEL);
+			if (!pages)
+				goto err_free;
+			for (i = 0; i < buf->nbufs; i++)
+				pages[i] = virt_to_page(buf->page_list[i].buf);
+			buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP, PAGE_KERNEL);
+			kfree(pages);
+			if (!buf->direct.buf)
+				goto err_free;
+		}
+	}
+
+	return 0;
+
+err_free:
+	mlx5_buf_free(dev, buf);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(mlx5_buf_alloc);
+
+void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf)
+{
+	int i;
+
+	if (buf->nbufs == 1)
+		dma_free_coherent(&dev->pdev->dev, buf->size, buf->direct.buf,
+				  buf->direct.map);
+	else {
+		if (BITS_PER_LONG == 64 && buf->direct.buf)
+			vunmap(buf->direct.buf);
+
+		for (i = 0; i < buf->nbufs; i++)
+			if (buf->page_list[i].buf)
+				dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+						  buf->page_list[i].buf,
+						  buf->page_list[i].map);
+		kfree(buf->page_list);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx5_buf_free);
+
+static struct mlx5_db_pgdir *mlx5_alloc_db_pgdir(struct device *dma_device)
+{
+	struct mlx5_db_pgdir *pgdir;
+
+	pgdir = kzalloc(sizeof(*pgdir), GFP_KERNEL);
+	if (!pgdir)
+		return NULL;
+
+	bitmap_fill(pgdir->bitmap, MLX5_DB_PER_PAGE);
+	pgdir->db_page = dma_alloc_coherent(dma_device, PAGE_SIZE,
+					    &pgdir->db_dma, GFP_KERNEL);
+	if (!pgdir->db_page) {
+		kfree(pgdir);
+		return NULL;
+	}
+
+	return pgdir;
+}
+
+static int mlx5_alloc_db_from_pgdir(struct mlx5_db_pgdir *pgdir,
+				    struct mlx5_db *db)
+{
+	int offset;
+	int i;
+
+	i = find_first_bit(pgdir->bitmap, MLX5_DB_PER_PAGE);
+	if (i >= MLX5_DB_PER_PAGE)
+		return -ENOMEM;
+
+	__clear_bit(i, pgdir->bitmap);
+
+	db->u.pgdir = pgdir;
+	db->index   = i;
+	offset = db->index * L1_CACHE_BYTES;
+	db->db      = pgdir->db_page + offset / sizeof(*pgdir->db_page);
+	db->dma     = pgdir->db_dma  + offset;
+
+	return 0;
+}
+
+int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db)
+{
+	struct mlx5_db_pgdir *pgdir;
+	int ret = 0;
+
+	mutex_lock(&dev->priv.pgdir_mutex);
+
+	list_for_each_entry(pgdir, &dev->priv.pgdir_list, list)
+		if (!mlx5_alloc_db_from_pgdir(pgdir, db))
+			goto out;
+
+	pgdir = mlx5_alloc_db_pgdir(&(dev->pdev->dev));
+	if (!pgdir) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	list_add(&pgdir->list, &dev->priv.pgdir_list);
+
+	/* This should never fail -- we just allocated an empty page: */
+	WARN_ON(mlx5_alloc_db_from_pgdir(pgdir, db));
+
+out:
+	mutex_unlock(&dev->priv.pgdir_mutex);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mlx5_db_alloc);
+
+void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db)
+{
+	mutex_lock(&dev->priv.pgdir_mutex);
+
+	__set_bit(db->index, db->u.pgdir->bitmap);
+
+	if (bitmap_full(db->u.pgdir->bitmap, MLX5_DB_PER_PAGE)) {
+		dma_free_coherent(&(dev->pdev->dev), PAGE_SIZE,
+				  db->u.pgdir->db_page, db->u.pgdir->db_dma);
+		list_del(&db->u.pgdir->list);
+		kfree(db->u.pgdir);
+	}
+
+	mutex_unlock(&dev->priv.pgdir_mutex);
+}
+EXPORT_SYMBOL_GPL(mlx5_db_free);
+
+
+void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas)
+{
+	u64 addr;
+	int i;
+
+	for (i = 0; i < buf->npages; i++) {
+		if (buf->nbufs == 1)
+			addr = buf->direct.map + (i << buf->page_shift);
+		else
+			addr = buf->page_list[i].map;
+
+		pas[i] = cpu_to_be64(addr);
+	}
+}
+EXPORT_SYMBOL_GPL(mlx5_fill_page_array);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
new file mode 100644
index 000000000000..c1c0eef89694
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -0,0 +1,1515 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm-generic/kmap_types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/random.h>
+#include <linux/io-mapping.h>
+#include <linux/mlx5/driver.h>
+#include <linux/debugfs.h>
+
+#include "mlx5_core.h"
+
+enum {
+	CMD_IF_REV = 3,
+};
+
+enum {
+	CMD_MODE_POLLING,
+	CMD_MODE_EVENTS
+};
+
+enum {
+	NUM_LONG_LISTS	  = 2,
+	NUM_MED_LISTS	  = 64,
+	LONG_LIST_SIZE	  = (2ULL * 1024 * 1024 * 1024 / PAGE_SIZE) * 8 + 16 +
+				MLX5_CMD_DATA_BLOCK_SIZE,
+	MED_LIST_SIZE	  = 16 + MLX5_CMD_DATA_BLOCK_SIZE,
+};
+
+enum {
+	MLX5_CMD_DELIVERY_STAT_OK			= 0x0,
+	MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR		= 0x1,
+	MLX5_CMD_DELIVERY_STAT_TOK_ERR			= 0x2,
+	MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR		= 0x3,
+	MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR	= 0x4,
+	MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR		= 0x5,
+	MLX5_CMD_DELIVERY_STAT_FW_ERR			= 0x6,
+	MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR		= 0x7,
+	MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR		= 0x8,
+	MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR	= 0x9,
+	MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR		= 0x10,
+};
+
+enum {
+	MLX5_CMD_STAT_OK			= 0x0,
+	MLX5_CMD_STAT_INT_ERR			= 0x1,
+	MLX5_CMD_STAT_BAD_OP_ERR		= 0x2,
+	MLX5_CMD_STAT_BAD_PARAM_ERR		= 0x3,
+	MLX5_CMD_STAT_BAD_SYS_STATE_ERR		= 0x4,
+	MLX5_CMD_STAT_BAD_RES_ERR		= 0x5,
+	MLX5_CMD_STAT_RES_BUSY			= 0x6,
+	MLX5_CMD_STAT_LIM_ERR			= 0x8,
+	MLX5_CMD_STAT_BAD_RES_STATE_ERR		= 0x9,
+	MLX5_CMD_STAT_IX_ERR			= 0xa,
+	MLX5_CMD_STAT_NO_RES_ERR		= 0xf,
+	MLX5_CMD_STAT_BAD_INP_LEN_ERR		= 0x50,
+	MLX5_CMD_STAT_BAD_OUTP_LEN_ERR		= 0x51,
+	MLX5_CMD_STAT_BAD_QP_STATE_ERR		= 0x10,
+	MLX5_CMD_STAT_BAD_PKT_ERR		= 0x30,
+	MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR	= 0x40,
+};
+
+static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd,
+					   struct mlx5_cmd_msg *in,
+					   struct mlx5_cmd_msg *out,
+					   mlx5_cmd_cbk_t cbk,
+					   void *context, int page_queue)
+{
+	gfp_t alloc_flags = cbk ? GFP_ATOMIC : GFP_KERNEL;
+	struct mlx5_cmd_work_ent *ent;
+
+	ent = kzalloc(sizeof(*ent), alloc_flags);
+	if (!ent)
+		return ERR_PTR(-ENOMEM);
+
+	ent->in		= in;
+	ent->out	= out;
+	ent->callback	= cbk;
+	ent->context	= context;
+	ent->cmd	= cmd;
+	ent->page_queue = page_queue;
+
+	return ent;
+}
+
+static u8 alloc_token(struct mlx5_cmd *cmd)
+{
+	u8 token;
+
+	spin_lock(&cmd->token_lock);
+	token = cmd->token++ % 255 + 1;
+	spin_unlock(&cmd->token_lock);
+
+	return token;
+}
+
+static int alloc_ent(struct mlx5_cmd *cmd)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&cmd->alloc_lock, flags);
+	ret = find_first_bit(&cmd->bitmask, cmd->max_reg_cmds);
+	if (ret < cmd->max_reg_cmds)
+		clear_bit(ret, &cmd->bitmask);
+	spin_unlock_irqrestore(&cmd->alloc_lock, flags);
+
+	return ret < cmd->max_reg_cmds ? ret : -ENOMEM;
+}
+
+static void free_ent(struct mlx5_cmd *cmd, int idx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&cmd->alloc_lock, flags);
+	set_bit(idx, &cmd->bitmask);
+	spin_unlock_irqrestore(&cmd->alloc_lock, flags);
+}
+
+static struct mlx5_cmd_layout *get_inst(struct mlx5_cmd *cmd, int idx)
+{
+	return cmd->cmd_buf + (idx << cmd->log_stride);
+}
+
+static u8 xor8_buf(void *buf, int len)
+{
+	u8 *ptr = buf;
+	u8 sum = 0;
+	int i;
+
+	for (i = 0; i < len; i++)
+		sum ^= ptr[i];
+
+	return sum;
+}
+
+static int verify_block_sig(struct mlx5_cmd_prot_block *block)
+{
+	if (xor8_buf(block->rsvd0, sizeof(*block) - sizeof(block->data) - 1) != 0xff)
+		return -EINVAL;
+
+	if (xor8_buf(block, sizeof(*block)) != 0xff)
+		return -EINVAL;
+
+	return 0;
+}
+
+static void calc_block_sig(struct mlx5_cmd_prot_block *block, u8 token)
+{
+	block->token = token;
+	block->ctrl_sig = ~xor8_buf(block->rsvd0, sizeof(*block) - sizeof(block->data) - 2);
+	block->sig = ~xor8_buf(block, sizeof(*block) - 1);
+}
+
+static void calc_chain_sig(struct mlx5_cmd_msg *msg, u8 token)
+{
+	struct mlx5_cmd_mailbox *next = msg->next;
+
+	while (next) {
+		calc_block_sig(next->buf, token);
+		next = next->next;
+	}
+}
+
+static void set_signature(struct mlx5_cmd_work_ent *ent)
+{
+	ent->lay->sig = ~xor8_buf(ent->lay, sizeof(*ent->lay));
+	calc_chain_sig(ent->in, ent->token);
+	calc_chain_sig(ent->out, ent->token);
+}
+
+static void poll_timeout(struct mlx5_cmd_work_ent *ent)
+{
+	unsigned long poll_end = jiffies + msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC + 1000);
+	u8 own;
+
+	do {
+		own = ent->lay->status_own;
+		if (!(own & CMD_OWNER_HW)) {
+			ent->ret = 0;
+			return;
+		}
+		usleep_range(5000, 10000);
+	} while (time_before(jiffies, poll_end));
+
+	ent->ret = -ETIMEDOUT;
+}
+
+static void free_cmd(struct mlx5_cmd_work_ent *ent)
+{
+	kfree(ent);
+}
+
+
+static int verify_signature(struct mlx5_cmd_work_ent *ent)
+{
+	struct mlx5_cmd_mailbox *next = ent->out->next;
+	int err;
+	u8 sig;
+
+	sig = xor8_buf(ent->lay, sizeof(*ent->lay));
+	if (sig != 0xff)
+		return -EINVAL;
+
+	while (next) {
+		err = verify_block_sig(next->buf);
+		if (err)
+			return err;
+
+		next = next->next;
+	}
+
+	return 0;
+}
+
+static void dump_buf(void *buf, int size, int data_only, int offset)
+{
+	__be32 *p = buf;
+	int i;
+
+	for (i = 0; i < size; i += 16) {
+		pr_debug("%03x: %08x %08x %08x %08x\n", offset, be32_to_cpu(p[0]),
+			 be32_to_cpu(p[1]), be32_to_cpu(p[2]),
+			 be32_to_cpu(p[3]));
+		p += 4;
+		offset += 16;
+	}
+	if (!data_only)
+		pr_debug("\n");
+}
+
+const char *mlx5_command_str(int command)
+{
+	switch (command) {
+	case MLX5_CMD_OP_QUERY_HCA_CAP:
+		return "QUERY_HCA_CAP";
+
+	case MLX5_CMD_OP_SET_HCA_CAP:
+		return "SET_HCA_CAP";
+
+	case MLX5_CMD_OP_QUERY_ADAPTER:
+		return "QUERY_ADAPTER";
+
+	case MLX5_CMD_OP_INIT_HCA:
+		return "INIT_HCA";
+
+	case MLX5_CMD_OP_TEARDOWN_HCA:
+		return "TEARDOWN_HCA";
+
+	case MLX5_CMD_OP_QUERY_PAGES:
+		return "QUERY_PAGES";
+
+	case MLX5_CMD_OP_MANAGE_PAGES:
+		return "MANAGE_PAGES";
+
+	case MLX5_CMD_OP_CREATE_MKEY:
+		return "CREATE_MKEY";
+
+	case MLX5_CMD_OP_QUERY_MKEY:
+		return "QUERY_MKEY";
+
+	case MLX5_CMD_OP_DESTROY_MKEY:
+		return "DESTROY_MKEY";
+
+	case MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS:
+		return "QUERY_SPECIAL_CONTEXTS";
+
+	case MLX5_CMD_OP_CREATE_EQ:
+		return "CREATE_EQ";
+
+	case MLX5_CMD_OP_DESTROY_EQ:
+		return "DESTROY_EQ";
+
+	case MLX5_CMD_OP_QUERY_EQ:
+		return "QUERY_EQ";
+
+	case MLX5_CMD_OP_CREATE_CQ:
+		return "CREATE_CQ";
+
+	case MLX5_CMD_OP_DESTROY_CQ:
+		return "DESTROY_CQ";
+
+	case MLX5_CMD_OP_QUERY_CQ:
+		return "QUERY_CQ";
+
+	case MLX5_CMD_OP_MODIFY_CQ:
+		return "MODIFY_CQ";
+
+	case MLX5_CMD_OP_CREATE_QP:
+		return "CREATE_QP";
+
+	case MLX5_CMD_OP_DESTROY_QP:
+		return "DESTROY_QP";
+
+	case MLX5_CMD_OP_RST2INIT_QP:
+		return "RST2INIT_QP";
+
+	case MLX5_CMD_OP_INIT2RTR_QP:
+		return "INIT2RTR_QP";
+
+	case MLX5_CMD_OP_RTR2RTS_QP:
+		return "RTR2RTS_QP";
+
+	case MLX5_CMD_OP_RTS2RTS_QP:
+		return "RTS2RTS_QP";
+
+	case MLX5_CMD_OP_SQERR2RTS_QP:
+		return "SQERR2RTS_QP";
+
+	case MLX5_CMD_OP_2ERR_QP:
+		return "2ERR_QP";
+
+	case MLX5_CMD_OP_RTS2SQD_QP:
+		return "RTS2SQD_QP";
+
+	case MLX5_CMD_OP_SQD2RTS_QP:
+		return "SQD2RTS_QP";
+
+	case MLX5_CMD_OP_2RST_QP:
+		return "2RST_QP";
+
+	case MLX5_CMD_OP_QUERY_QP:
+		return "QUERY_QP";
+
+	case MLX5_CMD_OP_CONF_SQP:
+		return "CONF_SQP";
+
+	case MLX5_CMD_OP_MAD_IFC:
+		return "MAD_IFC";
+
+	case MLX5_CMD_OP_INIT2INIT_QP:
+		return "INIT2INIT_QP";
+
+	case MLX5_CMD_OP_SUSPEND_QP:
+		return "SUSPEND_QP";
+
+	case MLX5_CMD_OP_UNSUSPEND_QP:
+		return "UNSUSPEND_QP";
+
+	case MLX5_CMD_OP_SQD2SQD_QP:
+		return "SQD2SQD_QP";
+
+	case MLX5_CMD_OP_ALLOC_QP_COUNTER_SET:
+		return "ALLOC_QP_COUNTER_SET";
+
+	case MLX5_CMD_OP_DEALLOC_QP_COUNTER_SET:
+		return "DEALLOC_QP_COUNTER_SET";
+
+	case MLX5_CMD_OP_QUERY_QP_COUNTER_SET:
+		return "QUERY_QP_COUNTER_SET";
+
+	case MLX5_CMD_OP_CREATE_PSV:
+		return "CREATE_PSV";
+
+	case MLX5_CMD_OP_DESTROY_PSV:
+		return "DESTROY_PSV";
+
+	case MLX5_CMD_OP_QUERY_PSV:
+		return "QUERY_PSV";
+
+	case MLX5_CMD_OP_QUERY_SIG_RULE_TABLE:
+		return "QUERY_SIG_RULE_TABLE";
+
+	case MLX5_CMD_OP_QUERY_BLOCK_SIZE_TABLE:
+		return "QUERY_BLOCK_SIZE_TABLE";
+
+	case MLX5_CMD_OP_CREATE_SRQ:
+		return "CREATE_SRQ";
+
+	case MLX5_CMD_OP_DESTROY_SRQ:
+		return "DESTROY_SRQ";
+
+	case MLX5_CMD_OP_QUERY_SRQ:
+		return "QUERY_SRQ";
+
+	case MLX5_CMD_OP_ARM_RQ:
+		return "ARM_RQ";
+
+	case MLX5_CMD_OP_RESIZE_SRQ:
+		return "RESIZE_SRQ";
+
+	case MLX5_CMD_OP_ALLOC_PD:
+		return "ALLOC_PD";
+
+	case MLX5_CMD_OP_DEALLOC_PD:
+		return "DEALLOC_PD";
+
+	case MLX5_CMD_OP_ALLOC_UAR:
+		return "ALLOC_UAR";
+
+	case MLX5_CMD_OP_DEALLOC_UAR:
+		return "DEALLOC_UAR";
+
+	case MLX5_CMD_OP_ATTACH_TO_MCG:
+		return "ATTACH_TO_MCG";
+
+	case MLX5_CMD_OP_DETACH_FROM_MCG:
+		return "DETACH_FROM_MCG";
+
+	case MLX5_CMD_OP_ALLOC_XRCD:
+		return "ALLOC_XRCD";
+
+	case MLX5_CMD_OP_DEALLOC_XRCD:
+		return "DEALLOC_XRCD";
+
+	case MLX5_CMD_OP_ACCESS_REG:
+		return "MLX5_CMD_OP_ACCESS_REG";
+
+	default: return "unknown command opcode";
+	}
+}
+
+static void dump_command(struct mlx5_core_dev *dev,
+			 struct mlx5_cmd_work_ent *ent, int input)
+{
+	u16 op = be16_to_cpu(((struct mlx5_inbox_hdr *)(ent->lay->in))->opcode);
+	struct mlx5_cmd_msg *msg = input ? ent->in : ent->out;
+	struct mlx5_cmd_mailbox *next = msg->next;
+	int data_only;
+	int offset = 0;
+	int dump_len;
+
+	data_only = !!(mlx5_core_debug_mask & (1 << MLX5_CMD_DATA));
+
+	if (data_only)
+		mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_DATA,
+				   "dump command data %s(0x%x) %s\n",
+				   mlx5_command_str(op), op,
+				   input ? "INPUT" : "OUTPUT");
+	else
+		mlx5_core_dbg(dev, "dump command %s(0x%x) %s\n",
+			      mlx5_command_str(op), op,
+			      input ? "INPUT" : "OUTPUT");
+
+	if (data_only) {
+		if (input) {
+			dump_buf(ent->lay->in, sizeof(ent->lay->in), 1, offset);
+			offset += sizeof(ent->lay->in);
+		} else {
+			dump_buf(ent->lay->out, sizeof(ent->lay->out), 1, offset);
+			offset += sizeof(ent->lay->out);
+		}
+	} else {
+		dump_buf(ent->lay, sizeof(*ent->lay), 0, offset);
+		offset += sizeof(*ent->lay);
+	}
+
+	while (next && offset < msg->len) {
+		if (data_only) {
+			dump_len = min_t(int, MLX5_CMD_DATA_BLOCK_SIZE, msg->len - offset);
+			dump_buf(next->buf, dump_len, 1, offset);
+			offset += MLX5_CMD_DATA_BLOCK_SIZE;
+		} else {
+			mlx5_core_dbg(dev, "command block:\n");
+			dump_buf(next->buf, sizeof(struct mlx5_cmd_prot_block), 0, offset);
+			offset += sizeof(struct mlx5_cmd_prot_block);
+		}
+		next = next->next;
+	}
+
+	if (data_only)
+		pr_debug("\n");
+}
+
+static void cmd_work_handler(struct work_struct *work)
+{
+	struct mlx5_cmd_work_ent *ent = container_of(work, struct mlx5_cmd_work_ent, work);
+	struct mlx5_cmd *cmd = ent->cmd;
+	struct mlx5_core_dev *dev = container_of(cmd, struct mlx5_core_dev, cmd);
+	struct mlx5_cmd_layout *lay;
+	struct semaphore *sem;
+
+	sem = ent->page_queue ? &cmd->pages_sem : &cmd->sem;
+	down(sem);
+	if (!ent->page_queue) {
+		ent->idx = alloc_ent(cmd);
+		if (ent->idx < 0) {
+			mlx5_core_err(dev, "failed to allocate command entry\n");
+			up(sem);
+			return;
+		}
+	} else {
+		ent->idx = cmd->max_reg_cmds;
+	}
+
+	ent->token = alloc_token(cmd);
+	cmd->ent_arr[ent->idx] = ent;
+	lay = get_inst(cmd, ent->idx);
+	ent->lay = lay;
+	memset(lay, 0, sizeof(*lay));
+	memcpy(lay->in, ent->in->first.data, sizeof(lay->in));
+	if (ent->in->next)
+		lay->in_ptr = cpu_to_be64(ent->in->next->dma);
+	lay->inlen = cpu_to_be32(ent->in->len);
+	if (ent->out->next)
+		lay->out_ptr = cpu_to_be64(ent->out->next->dma);
+	lay->outlen = cpu_to_be32(ent->out->len);
+	lay->type = MLX5_PCI_CMD_XPORT;
+	lay->token = ent->token;
+	lay->status_own = CMD_OWNER_HW;
+	if (!cmd->checksum_disabled)
+		set_signature(ent);
+	dump_command(dev, ent, 1);
+	ktime_get_ts(&ent->ts1);
+
+	/* ring doorbell after the descriptor is valid */
+	wmb();
+	iowrite32be(1 << ent->idx, &dev->iseg->cmd_dbell);
+	mlx5_core_dbg(dev, "write 0x%x to command doorbell\n", 1 << ent->idx);
+	mmiowb();
+	if (cmd->mode == CMD_MODE_POLLING) {
+		poll_timeout(ent);
+		/* make sure we read the descriptor after ownership is SW */
+		rmb();
+		mlx5_cmd_comp_handler(dev, 1UL << ent->idx);
+	}
+}
+
+static const char *deliv_status_to_str(u8 status)
+{
+	switch (status) {
+	case MLX5_CMD_DELIVERY_STAT_OK:
+		return "no errors";
+	case MLX5_CMD_DELIVERY_STAT_SIGNAT_ERR:
+		return "signature error";
+	case MLX5_CMD_DELIVERY_STAT_TOK_ERR:
+		return "token error";
+	case MLX5_CMD_DELIVERY_STAT_BAD_BLK_NUM_ERR:
+		return "bad block number";
+	case MLX5_CMD_DELIVERY_STAT_OUT_PTR_ALIGN_ERR:
+		return "output pointer not aligned to block size";
+	case MLX5_CMD_DELIVERY_STAT_IN_PTR_ALIGN_ERR:
+		return "input pointer not aligned to block size";
+	case MLX5_CMD_DELIVERY_STAT_FW_ERR:
+		return "firmware internal error";
+	case MLX5_CMD_DELIVERY_STAT_IN_LENGTH_ERR:
+		return "command input length error";
+	case MLX5_CMD_DELIVERY_STAT_OUT_LENGTH_ERR:
+		return "command ouput length error";
+	case MLX5_CMD_DELIVERY_STAT_RES_FLD_NOT_CLR_ERR:
+		return "reserved fields not cleared";
+	case MLX5_CMD_DELIVERY_STAT_CMD_DESCR_ERR:
+		return "bad command descriptor type";
+	default:
+		return "unknown status code";
+	}
+}
+
+static u16 msg_to_opcode(struct mlx5_cmd_msg *in)
+{
+	struct mlx5_inbox_hdr *hdr = (struct mlx5_inbox_hdr *)(in->first.data);
+
+	return be16_to_cpu(hdr->opcode);
+}
+
+static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent)
+{
+	unsigned long timeout = msecs_to_jiffies(MLX5_CMD_TIMEOUT_MSEC);
+	struct mlx5_cmd *cmd = &dev->cmd;
+	int err;
+
+	if (cmd->mode == CMD_MODE_POLLING) {
+		wait_for_completion(&ent->done);
+		err = ent->ret;
+	} else {
+		if (!wait_for_completion_timeout(&ent->done, timeout))
+			err = -ETIMEDOUT;
+		else
+			err = 0;
+	}
+	if (err == -ETIMEDOUT) {
+		mlx5_core_warn(dev, "%s(0x%x) timeout. Will cause a leak of a command resource\n",
+			       mlx5_command_str(msg_to_opcode(ent->in)),
+			       msg_to_opcode(ent->in));
+	}
+	mlx5_core_dbg(dev, "err %d, delivery status %s(%d)\n", err,
+		      deliv_status_to_str(ent->status), ent->status);
+
+	return err;
+}
+
+/*  Notes:
+ *    1. Callback functions may not sleep
+ *    2. page queue commands do not support asynchrous completion
+ */
+static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in,
+			   struct mlx5_cmd_msg *out, mlx5_cmd_cbk_t callback,
+			   void *context, int page_queue, u8 *status)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct mlx5_cmd_work_ent *ent;
+	ktime_t t1, t2, delta;
+	struct mlx5_cmd_stats *stats;
+	int err = 0;
+	s64 ds;
+	u16 op;
+
+	if (callback && page_queue)
+		return -EINVAL;
+
+	ent = alloc_cmd(cmd, in, out, callback, context, page_queue);
+	if (IS_ERR(ent))
+		return PTR_ERR(ent);
+
+	if (!callback)
+		init_completion(&ent->done);
+
+	INIT_WORK(&ent->work, cmd_work_handler);
+	if (page_queue) {
+		cmd_work_handler(&ent->work);
+	} else if (!queue_work(cmd->wq, &ent->work)) {
+		mlx5_core_warn(dev, "failed to queue work\n");
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	if (!callback) {
+		err = wait_func(dev, ent);
+		if (err == -ETIMEDOUT)
+			goto out;
+
+		t1 = timespec_to_ktime(ent->ts1);
+		t2 = timespec_to_ktime(ent->ts2);
+		delta = ktime_sub(t2, t1);
+		ds = ktime_to_ns(delta);
+		op = be16_to_cpu(((struct mlx5_inbox_hdr *)in->first.data)->opcode);
+		if (op < ARRAY_SIZE(cmd->stats)) {
+			stats = &cmd->stats[op];
+			spin_lock(&stats->lock);
+			stats->sum += ds;
+			++stats->n;
+			spin_unlock(&stats->lock);
+		}
+		mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_TIME,
+				   "fw exec time for %s is %lld nsec\n",
+				   mlx5_command_str(op), ds);
+		*status = ent->status;
+		free_cmd(ent);
+	}
+
+	return err;
+
+out_free:
+	free_cmd(ent);
+out:
+	return err;
+}
+
+static ssize_t dbg_write(struct file *filp, const char __user *buf,
+			 size_t count, loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	char lbuf[3];
+	int err;
+
+	if (!dbg->in_msg || !dbg->out_msg)
+		return -ENOMEM;
+
+	if (copy_from_user(lbuf, buf, sizeof(lbuf)))
+		return -EPERM;
+
+	lbuf[sizeof(lbuf) - 1] = 0;
+
+	if (strcmp(lbuf, "go"))
+		return -EINVAL;
+
+	err = mlx5_cmd_exec(dev, dbg->in_msg, dbg->inlen, dbg->out_msg, dbg->outlen);
+
+	return err ? err : count;
+}
+
+
+static const struct file_operations fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= dbg_write,
+};
+
+static int mlx5_copy_to_msg(struct mlx5_cmd_msg *to, void *from, int size)
+{
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_mailbox *next;
+	int copy;
+
+	if (!to || !from)
+		return -ENOMEM;
+
+	copy = min_t(int, size, sizeof(to->first.data));
+	memcpy(to->first.data, from, copy);
+	size -= copy;
+	from += copy;
+
+	next = to->next;
+	while (size) {
+		if (!next) {
+			/* this is a BUG */
+			return -ENOMEM;
+		}
+
+		copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE);
+		block = next->buf;
+		memcpy(block->data, from, copy);
+		from += copy;
+		size -= copy;
+		next = next->next;
+	}
+
+	return 0;
+}
+
+static int mlx5_copy_from_msg(void *to, struct mlx5_cmd_msg *from, int size)
+{
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_mailbox *next;
+	int copy;
+
+	if (!to || !from)
+		return -ENOMEM;
+
+	copy = min_t(int, size, sizeof(from->first.data));
+	memcpy(to, from->first.data, copy);
+	size -= copy;
+	to += copy;
+
+	next = from->next;
+	while (size) {
+		if (!next) {
+			/* this is a BUG */
+			return -ENOMEM;
+		}
+
+		copy = min_t(int, size, MLX5_CMD_DATA_BLOCK_SIZE);
+		block = next->buf;
+		if (xor8_buf(block, sizeof(*block)) != 0xff)
+			return -EINVAL;
+
+		memcpy(to, block->data, copy);
+		to += copy;
+		size -= copy;
+		next = next->next;
+	}
+
+	return 0;
+}
+
+static struct mlx5_cmd_mailbox *alloc_cmd_box(struct mlx5_core_dev *dev,
+					      gfp_t flags)
+{
+	struct mlx5_cmd_mailbox *mailbox;
+
+	mailbox = kmalloc(sizeof(*mailbox), flags);
+	if (!mailbox)
+		return ERR_PTR(-ENOMEM);
+
+	mailbox->buf = pci_pool_alloc(dev->cmd.pool, flags,
+				      &mailbox->dma);
+	if (!mailbox->buf) {
+		mlx5_core_dbg(dev, "failed allocation\n");
+		kfree(mailbox);
+		return ERR_PTR(-ENOMEM);
+	}
+	memset(mailbox->buf, 0, sizeof(struct mlx5_cmd_prot_block));
+	mailbox->next = NULL;
+
+	return mailbox;
+}
+
+static void free_cmd_box(struct mlx5_core_dev *dev,
+			 struct mlx5_cmd_mailbox *mailbox)
+{
+	pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma);
+	kfree(mailbox);
+}
+
+static struct mlx5_cmd_msg *mlx5_alloc_cmd_msg(struct mlx5_core_dev *dev,
+					       gfp_t flags, int size)
+{
+	struct mlx5_cmd_mailbox *tmp, *head = NULL;
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_msg *msg;
+	int blen;
+	int err;
+	int n;
+	int i;
+
+	msg = kzalloc(sizeof(*msg), GFP_KERNEL);
+	if (!msg)
+		return ERR_PTR(-ENOMEM);
+
+	blen = size - min_t(int, sizeof(msg->first.data), size);
+	n = (blen + MLX5_CMD_DATA_BLOCK_SIZE - 1) / MLX5_CMD_DATA_BLOCK_SIZE;
+
+	for (i = 0; i < n; i++) {
+		tmp = alloc_cmd_box(dev, flags);
+		if (IS_ERR(tmp)) {
+			mlx5_core_warn(dev, "failed allocating block\n");
+			err = PTR_ERR(tmp);
+			goto err_alloc;
+		}
+
+		block = tmp->buf;
+		tmp->next = head;
+		block->next = cpu_to_be64(tmp->next ? tmp->next->dma : 0);
+		block->block_num = cpu_to_be32(n - i - 1);
+		head = tmp;
+	}
+	msg->next = head;
+	msg->len = size;
+	return msg;
+
+err_alloc:
+	while (head) {
+		tmp = head->next;
+		free_cmd_box(dev, head);
+		head = tmp;
+	}
+	kfree(msg);
+
+	return ERR_PTR(err);
+}
+
+static void mlx5_free_cmd_msg(struct mlx5_core_dev *dev,
+				  struct mlx5_cmd_msg *msg)
+{
+	struct mlx5_cmd_mailbox *head = msg->next;
+	struct mlx5_cmd_mailbox *next;
+
+	while (head) {
+		next = head->next;
+		free_cmd_box(dev, head);
+		head = next;
+	}
+	kfree(msg);
+}
+
+static ssize_t data_write(struct file *filp, const char __user *buf,
+			  size_t count, loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	void *ptr;
+	int err;
+
+	if (*pos != 0)
+		return -EINVAL;
+
+	kfree(dbg->in_msg);
+	dbg->in_msg = NULL;
+	dbg->inlen = 0;
+
+	ptr = kzalloc(count, GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	if (copy_from_user(ptr, buf, count)) {
+		err = -EPERM;
+		goto out;
+	}
+	dbg->in_msg = ptr;
+	dbg->inlen = count;
+
+	*pos = count;
+
+	return count;
+
+out:
+	kfree(ptr);
+	return err;
+}
+
+static ssize_t data_read(struct file *filp, char __user *buf, size_t count,
+			 loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	int copy;
+
+	if (*pos)
+		return 0;
+
+	if (!dbg->out_msg)
+		return -ENOMEM;
+
+	copy = min_t(int, count, dbg->outlen);
+	if (copy_to_user(buf, dbg->out_msg, copy))
+		return -EPERM;
+
+	*pos += copy;
+
+	return copy;
+}
+
+static const struct file_operations dfops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= data_write,
+	.read	= data_read,
+};
+
+static ssize_t outlen_read(struct file *filp, char __user *buf, size_t count,
+			   loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	char outlen[8];
+	int err;
+
+	if (*pos)
+		return 0;
+
+	err = snprintf(outlen, sizeof(outlen), "%d", dbg->outlen);
+	if (err < 0)
+		return err;
+
+	if (copy_to_user(buf, &outlen, err))
+		return -EPERM;
+
+	*pos += err;
+
+	return err;
+}
+
+static ssize_t outlen_write(struct file *filp, const char __user *buf,
+			    size_t count, loff_t *pos)
+{
+	struct mlx5_core_dev *dev = filp->private_data;
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	char outlen_str[8];
+	int outlen;
+	void *ptr;
+	int err;
+
+	if (*pos != 0 || count > 6)
+		return -EINVAL;
+
+	kfree(dbg->out_msg);
+	dbg->out_msg = NULL;
+	dbg->outlen = 0;
+
+	if (copy_from_user(outlen_str, buf, count))
+		return -EPERM;
+
+	outlen_str[7] = 0;
+
+	err = sscanf(outlen_str, "%d", &outlen);
+	if (err < 0)
+		return err;
+
+	ptr = kzalloc(outlen, GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+
+	dbg->out_msg = ptr;
+	dbg->outlen = outlen;
+
+	*pos = count;
+
+	return count;
+}
+
+static const struct file_operations olfops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.write	= outlen_write,
+	.read	= outlen_read,
+};
+
+static void set_wqname(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+
+	snprintf(cmd->wq_name, sizeof(cmd->wq_name), "mlx5_cmd_%s",
+		 dev_name(&dev->pdev->dev));
+}
+
+static void clean_debug_files(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+
+	if (!mlx5_debugfs_root)
+		return;
+
+	mlx5_cmdif_debugfs_cleanup(dev);
+	debugfs_remove_recursive(dbg->dbg_root);
+}
+
+static int create_debugfs_files(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_debug *dbg = &dev->cmd.dbg;
+	int err = -ENOMEM;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	dbg->dbg_root = debugfs_create_dir("cmd", dev->priv.dbg_root);
+	if (!dbg->dbg_root)
+		return err;
+
+	dbg->dbg_in = debugfs_create_file("in", 0400, dbg->dbg_root,
+					  dev, &dfops);
+	if (!dbg->dbg_in)
+		goto err_dbg;
+
+	dbg->dbg_out = debugfs_create_file("out", 0200, dbg->dbg_root,
+					   dev, &dfops);
+	if (!dbg->dbg_out)
+		goto err_dbg;
+
+	dbg->dbg_outlen = debugfs_create_file("out_len", 0600, dbg->dbg_root,
+					      dev, &olfops);
+	if (!dbg->dbg_outlen)
+		goto err_dbg;
+
+	dbg->dbg_status = debugfs_create_u8("status", 0600, dbg->dbg_root,
+					    &dbg->status);
+	if (!dbg->dbg_status)
+		goto err_dbg;
+
+	dbg->dbg_run = debugfs_create_file("run", 0200, dbg->dbg_root, dev, &fops);
+	if (!dbg->dbg_run)
+		goto err_dbg;
+
+	mlx5_cmdif_debugfs_init(dev);
+
+	return 0;
+
+err_dbg:
+	clean_debug_files(dev);
+	return err;
+}
+
+void mlx5_cmd_use_events(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	int i;
+
+	for (i = 0; i < cmd->max_reg_cmds; i++)
+		down(&cmd->sem);
+
+	down(&cmd->pages_sem);
+
+	flush_workqueue(cmd->wq);
+
+	cmd->mode = CMD_MODE_EVENTS;
+
+	up(&cmd->pages_sem);
+	for (i = 0; i < cmd->max_reg_cmds; i++)
+		up(&cmd->sem);
+}
+
+void mlx5_cmd_use_polling(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	int i;
+
+	for (i = 0; i < cmd->max_reg_cmds; i++)
+		down(&cmd->sem);
+
+	down(&cmd->pages_sem);
+
+	flush_workqueue(cmd->wq);
+	cmd->mode = CMD_MODE_POLLING;
+
+	up(&cmd->pages_sem);
+	for (i = 0; i < cmd->max_reg_cmds; i++)
+		up(&cmd->sem);
+}
+
+void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct mlx5_cmd_work_ent *ent;
+	mlx5_cmd_cbk_t callback;
+	void *context;
+	int err;
+	int i;
+
+	for (i = 0; i < (1 << cmd->log_sz); i++) {
+		if (test_bit(i, &vector)) {
+			ent = cmd->ent_arr[i];
+			ktime_get_ts(&ent->ts2);
+			memcpy(ent->out->first.data, ent->lay->out, sizeof(ent->lay->out));
+			dump_command(dev, ent, 0);
+			if (!ent->ret) {
+				if (!cmd->checksum_disabled)
+					ent->ret = verify_signature(ent);
+				else
+					ent->ret = 0;
+				ent->status = ent->lay->status_own >> 1;
+				mlx5_core_dbg(dev, "command completed. ret 0x%x, delivery status %s(0x%x)\n",
+					      ent->ret, deliv_status_to_str(ent->status), ent->status);
+			}
+			free_ent(cmd, ent->idx);
+			if (ent->callback) {
+				callback = ent->callback;
+				context = ent->context;
+				err = ent->ret;
+				free_cmd(ent);
+				callback(err, context);
+			} else {
+				complete(&ent->done);
+			}
+			if (ent->page_queue)
+				up(&cmd->pages_sem);
+			else
+				up(&cmd->sem);
+		}
+	}
+}
+EXPORT_SYMBOL(mlx5_cmd_comp_handler);
+
+static int status_to_err(u8 status)
+{
+	return status ? -1 : 0; /* TBD more meaningful codes */
+}
+
+static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size)
+{
+	struct mlx5_cmd_msg *msg = ERR_PTR(-ENOMEM);
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct cache_ent *ent = NULL;
+
+	if (in_size > MED_LIST_SIZE && in_size <= LONG_LIST_SIZE)
+		ent = &cmd->cache.large;
+	else if (in_size > 16 && in_size <= MED_LIST_SIZE)
+		ent = &cmd->cache.med;
+
+	if (ent) {
+		spin_lock(&ent->lock);
+		if (!list_empty(&ent->head)) {
+			msg = list_entry(ent->head.next, typeof(*msg), list);
+			/* For cached lists, we must explicitly state what is
+			 * the real size
+			 */
+			msg->len = in_size;
+			list_del(&msg->list);
+		}
+		spin_unlock(&ent->lock);
+	}
+
+	if (IS_ERR(msg))
+		msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, in_size);
+
+	return msg;
+}
+
+static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg)
+{
+	if (msg->cache) {
+		spin_lock(&msg->cache->lock);
+		list_add_tail(&msg->list, &msg->cache->head);
+		spin_unlock(&msg->cache->lock);
+	} else {
+		mlx5_free_cmd_msg(dev, msg);
+	}
+}
+
+static int is_manage_pages(struct mlx5_inbox_hdr *in)
+{
+	return be16_to_cpu(in->opcode) == MLX5_CMD_OP_MANAGE_PAGES;
+}
+
+int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+		  int out_size)
+{
+	struct mlx5_cmd_msg *inb;
+	struct mlx5_cmd_msg *outb;
+	int pages_queue;
+	int err;
+	u8 status = 0;
+
+	pages_queue = is_manage_pages(in);
+
+	inb = alloc_msg(dev, in_size);
+	if (IS_ERR(inb)) {
+		err = PTR_ERR(inb);
+		return err;
+	}
+
+	err = mlx5_copy_to_msg(inb, in, in_size);
+	if (err) {
+		mlx5_core_warn(dev, "err %d\n", err);
+		goto out_in;
+	}
+
+	outb = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, out_size);
+	if (IS_ERR(outb)) {
+		err = PTR_ERR(outb);
+		goto out_in;
+	}
+
+	err = mlx5_cmd_invoke(dev, inb, outb, NULL, NULL, pages_queue, &status);
+	if (err)
+		goto out_out;
+
+	mlx5_core_dbg(dev, "err %d, status %d\n", err, status);
+	if (status) {
+		err = status_to_err(status);
+		goto out_out;
+	}
+
+	err = mlx5_copy_from_msg(out, outb, out_size);
+
+out_out:
+	mlx5_free_cmd_msg(dev, outb);
+
+out_in:
+	free_msg(dev, inb);
+	return err;
+}
+EXPORT_SYMBOL(mlx5_cmd_exec);
+
+static void destroy_msg_cache(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct mlx5_cmd_msg *msg;
+	struct mlx5_cmd_msg *n;
+
+	list_for_each_entry_safe(msg, n, &cmd->cache.large.head, list) {
+		list_del(&msg->list);
+		mlx5_free_cmd_msg(dev, msg);
+	}
+
+	list_for_each_entry_safe(msg, n, &cmd->cache.med.head, list) {
+		list_del(&msg->list);
+		mlx5_free_cmd_msg(dev, msg);
+	}
+}
+
+static int create_msg_cache(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+	struct mlx5_cmd_msg *msg;
+	int err;
+	int i;
+
+	spin_lock_init(&cmd->cache.large.lock);
+	INIT_LIST_HEAD(&cmd->cache.large.head);
+	spin_lock_init(&cmd->cache.med.lock);
+	INIT_LIST_HEAD(&cmd->cache.med.head);
+
+	for (i = 0; i < NUM_LONG_LISTS; i++) {
+		msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, LONG_LIST_SIZE);
+		if (IS_ERR(msg)) {
+			err = PTR_ERR(msg);
+			goto ex_err;
+		}
+		msg->cache = &cmd->cache.large;
+		list_add_tail(&msg->list, &cmd->cache.large.head);
+	}
+
+	for (i = 0; i < NUM_MED_LISTS; i++) {
+		msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, MED_LIST_SIZE);
+		if (IS_ERR(msg)) {
+			err = PTR_ERR(msg);
+			goto ex_err;
+		}
+		msg->cache = &cmd->cache.med;
+		list_add_tail(&msg->list, &cmd->cache.med.head);
+	}
+
+	return 0;
+
+ex_err:
+	destroy_msg_cache(dev);
+	return err;
+}
+
+int mlx5_cmd_init(struct mlx5_core_dev *dev)
+{
+	int size = sizeof(struct mlx5_cmd_prot_block);
+	int align = roundup_pow_of_two(size);
+	struct mlx5_cmd *cmd = &dev->cmd;
+	u32 cmd_h, cmd_l;
+	u16 cmd_if_rev;
+	int err;
+	int i;
+
+	cmd_if_rev = cmdif_rev(dev);
+	if (cmd_if_rev != CMD_IF_REV) {
+		dev_err(&dev->pdev->dev,
+			"Driver cmdif rev(%d) differs from firmware's(%d)\n",
+			CMD_IF_REV, cmd_if_rev);
+		return -EINVAL;
+	}
+
+	cmd->pool = pci_pool_create("mlx5_cmd", dev->pdev, size, align, 0);
+	if (!cmd->pool)
+		return -ENOMEM;
+
+	cmd->cmd_buf = (void *)__get_free_pages(GFP_ATOMIC, 0);
+	if (!cmd->cmd_buf) {
+		err = -ENOMEM;
+		goto err_free_pool;
+	}
+	cmd->dma = dma_map_single(&dev->pdev->dev, cmd->cmd_buf, PAGE_SIZE,
+				  DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(&dev->pdev->dev, cmd->dma)) {
+		err = -ENOMEM;
+		goto err_free;
+	}
+
+	cmd_l = ioread32be(&dev->iseg->cmdq_addr_l_sz) & 0xff;
+	cmd->log_sz = cmd_l >> 4 & 0xf;
+	cmd->log_stride = cmd_l & 0xf;
+	if (1 << cmd->log_sz > MLX5_MAX_COMMANDS) {
+		dev_err(&dev->pdev->dev, "firmware reports too many outstanding commands %d\n",
+			1 << cmd->log_sz);
+		err = -EINVAL;
+		goto err_map;
+	}
+
+	if (cmd->log_sz + cmd->log_stride > PAGE_SHIFT) {
+		dev_err(&dev->pdev->dev, "command queue size overflow\n");
+		err = -EINVAL;
+		goto err_map;
+	}
+
+	cmd->max_reg_cmds = (1 << cmd->log_sz) - 1;
+	cmd->bitmask = (1 << cmd->max_reg_cmds) - 1;
+
+	cmd->cmdif_rev = ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16;
+	if (cmd->cmdif_rev > CMD_IF_REV) {
+		dev_err(&dev->pdev->dev, "driver does not support command interface version. driver %d, firmware %d\n",
+			CMD_IF_REV, cmd->cmdif_rev);
+		err = -ENOTSUPP;
+		goto err_map;
+	}
+
+	spin_lock_init(&cmd->alloc_lock);
+	spin_lock_init(&cmd->token_lock);
+	for (i = 0; i < ARRAY_SIZE(cmd->stats); i++)
+		spin_lock_init(&cmd->stats[i].lock);
+
+	sema_init(&cmd->sem, cmd->max_reg_cmds);
+	sema_init(&cmd->pages_sem, 1);
+
+	cmd_h = (u32)((u64)(cmd->dma) >> 32);
+	cmd_l = (u32)(cmd->dma);
+	if (cmd_l & 0xfff) {
+		dev_err(&dev->pdev->dev, "invalid command queue address\n");
+		err = -ENOMEM;
+		goto err_map;
+	}
+
+	iowrite32be(cmd_h, &dev->iseg->cmdq_addr_h);
+	iowrite32be(cmd_l, &dev->iseg->cmdq_addr_l_sz);
+
+	/* Make sure firmware sees the complete address before we proceed */
+	wmb();
+
+	mlx5_core_dbg(dev, "descriptor at dma 0x%llx\n", (unsigned long long)(cmd->dma));
+
+	cmd->mode = CMD_MODE_POLLING;
+
+	err = create_msg_cache(dev);
+	if (err) {
+		dev_err(&dev->pdev->dev, "failed to create command cache\n");
+		goto err_map;
+	}
+
+	set_wqname(dev);
+	cmd->wq = create_singlethread_workqueue(cmd->wq_name);
+	if (!cmd->wq) {
+		dev_err(&dev->pdev->dev, "failed to create command workqueue\n");
+		err = -ENOMEM;
+		goto err_cache;
+	}
+
+	err = create_debugfs_files(dev);
+	if (err) {
+		err = -ENOMEM;
+		goto err_wq;
+	}
+
+	return 0;
+
+err_wq:
+	destroy_workqueue(cmd->wq);
+
+err_cache:
+	destroy_msg_cache(dev);
+
+err_map:
+	dma_unmap_single(&dev->pdev->dev, cmd->dma, PAGE_SIZE,
+			 DMA_BIDIRECTIONAL);
+err_free:
+	free_pages((unsigned long)cmd->cmd_buf, 0);
+
+err_free_pool:
+	pci_pool_destroy(cmd->pool);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_cmd_init);
+
+void mlx5_cmd_cleanup(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd *cmd = &dev->cmd;
+
+	clean_debug_files(dev);
+	destroy_workqueue(cmd->wq);
+	destroy_msg_cache(dev);
+	dma_unmap_single(&dev->pdev->dev, cmd->dma, PAGE_SIZE,
+			 DMA_BIDIRECTIONAL);
+	free_pages((unsigned long)cmd->cmd_buf, 0);
+	pci_pool_destroy(cmd->pool);
+}
+EXPORT_SYMBOL(mlx5_cmd_cleanup);
+
+static const char *cmd_status_str(u8 status)
+{
+	switch (status) {
+	case MLX5_CMD_STAT_OK:
+		return "OK";
+	case MLX5_CMD_STAT_INT_ERR:
+		return "internal error";
+	case MLX5_CMD_STAT_BAD_OP_ERR:
+		return "bad operation";
+	case MLX5_CMD_STAT_BAD_PARAM_ERR:
+		return "bad parameter";
+	case MLX5_CMD_STAT_BAD_SYS_STATE_ERR:
+		return "bad system state";
+	case MLX5_CMD_STAT_BAD_RES_ERR:
+		return "bad resource";
+	case MLX5_CMD_STAT_RES_BUSY:
+		return "resource busy";
+	case MLX5_CMD_STAT_LIM_ERR:
+		return "limits exceeded";
+	case MLX5_CMD_STAT_BAD_RES_STATE_ERR:
+		return "bad resource state";
+	case MLX5_CMD_STAT_IX_ERR:
+		return "bad index";
+	case MLX5_CMD_STAT_NO_RES_ERR:
+		return "no resources";
+	case MLX5_CMD_STAT_BAD_INP_LEN_ERR:
+		return "bad input length";
+	case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR:
+		return "bad output length";
+	case MLX5_CMD_STAT_BAD_QP_STATE_ERR:
+		return "bad QP state";
+	case MLX5_CMD_STAT_BAD_PKT_ERR:
+		return "bad packet (discarded)";
+	case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR:
+		return "bad size too many outstanding CQEs";
+	default:
+		return "unknown status";
+	}
+}
+
+int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr)
+{
+	if (!hdr->status)
+		return 0;
+
+	pr_warn("command failed, status %s(0x%x), syndrome 0x%x\n",
+		cmd_status_str(hdr->status), hdr->status,
+		be32_to_cpu(hdr->syndrome));
+
+	switch (hdr->status) {
+	case MLX5_CMD_STAT_OK:				return 0;
+	case MLX5_CMD_STAT_INT_ERR:			return -EIO;
+	case MLX5_CMD_STAT_BAD_OP_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_BAD_PARAM_ERR:		return -EINVAL;
+	case MLX5_CMD_STAT_BAD_SYS_STATE_ERR:		return -EIO;
+	case MLX5_CMD_STAT_BAD_RES_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_RES_BUSY:			return -EBUSY;
+	case MLX5_CMD_STAT_LIM_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_BAD_RES_STATE_ERR:		return -EINVAL;
+	case MLX5_CMD_STAT_IX_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_NO_RES_ERR:			return -EAGAIN;
+	case MLX5_CMD_STAT_BAD_INP_LEN_ERR:		return -EIO;
+	case MLX5_CMD_STAT_BAD_OUTP_LEN_ERR:		return -EIO;
+	case MLX5_CMD_STAT_BAD_QP_STATE_ERR:		return -EINVAL;
+	case MLX5_CMD_STAT_BAD_PKT_ERR:			return -EINVAL;
+	case MLX5_CMD_STAT_BAD_SIZE_OUTS_CQES_ERR:	return -EINVAL;
+	default:					return -EIO;
+	}
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
new file mode 100644
index 000000000000..c2d660be6f76
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/hardirq.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <rdma/ib_verbs.h>
+#include <linux/mlx5/cq.h>
+#include "mlx5_core.h"
+
+void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn)
+{
+	struct mlx5_core_cq *cq;
+	struct mlx5_cq_table *table = &dev->priv.cq_table;
+
+	spin_lock(&table->lock);
+	cq = radix_tree_lookup(&table->tree, cqn);
+	if (likely(cq))
+		atomic_inc(&cq->refcount);
+	spin_unlock(&table->lock);
+
+	if (!cq) {
+		mlx5_core_warn(dev, "Completion event for bogus CQ 0x%x\n", cqn);
+		return;
+	}
+
+	++cq->arm_sn;
+
+	cq->comp(cq);
+
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+}
+
+void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type)
+{
+	struct mlx5_cq_table *table = &dev->priv.cq_table;
+	struct mlx5_core_cq *cq;
+
+	spin_lock(&table->lock);
+
+	cq = radix_tree_lookup(&table->tree, cqn);
+	if (cq)
+		atomic_inc(&cq->refcount);
+
+	spin_unlock(&table->lock);
+
+	if (!cq) {
+		mlx5_core_warn(dev, "Async event for bogus CQ 0x%x\n", cqn);
+		return;
+	}
+
+	cq->event(cq, event_type);
+
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+}
+
+
+int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			struct mlx5_create_cq_mbox_in *in, int inlen)
+{
+	int err;
+	struct mlx5_cq_table *table = &dev->priv.cq_table;
+	struct mlx5_create_cq_mbox_out out;
+	struct mlx5_destroy_cq_mbox_in din;
+	struct mlx5_destroy_cq_mbox_out dout;
+
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_CQ);
+	memset(&out, 0, sizeof(out));
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	cq->cqn = be32_to_cpu(out.cqn) & 0xffffff;
+	cq->cons_index = 0;
+	cq->arm_sn     = 0;
+	atomic_set(&cq->refcount, 1);
+	init_completion(&cq->free);
+
+	spin_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree, cq->cqn, cq);
+	spin_unlock_irq(&table->lock);
+	if (err)
+		goto err_cmd;
+
+	cq->pid = current->pid;
+	err = mlx5_debug_cq_add(dev, cq);
+	if (err)
+		mlx5_core_dbg(dev, "failed adding CP 0x%x to debug file system\n",
+			      cq->cqn);
+
+	return 0;
+
+err_cmd:
+	memset(&din, 0, sizeof(din));
+	memset(&dout, 0, sizeof(dout));
+	din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_CQ);
+	mlx5_cmd_exec(dev, &din, sizeof(din), &dout, sizeof(dout));
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_cq);
+
+int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
+{
+	struct mlx5_cq_table *table = &dev->priv.cq_table;
+	struct mlx5_destroy_cq_mbox_in in;
+	struct mlx5_destroy_cq_mbox_out out;
+	struct mlx5_core_cq *tmp;
+	int err;
+
+	spin_lock_irq(&table->lock);
+	tmp = radix_tree_delete(&table->tree, cq->cqn);
+	spin_unlock_irq(&table->lock);
+	if (!tmp) {
+		mlx5_core_warn(dev, "cq 0x%x not found in tree\n", cq->cqn);
+		return -EINVAL;
+	}
+	if (tmp != cq) {
+		mlx5_core_warn(dev, "corruption on srqn 0x%x\n", cq->cqn);
+		return -EINVAL;
+	}
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_CQ);
+	in.cqn = cpu_to_be32(cq->cqn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	synchronize_irq(cq->irqn);
+
+	mlx5_debug_cq_remove(dev, cq);
+	if (atomic_dec_and_test(&cq->refcount))
+		complete(&cq->free);
+	wait_for_completion(&cq->free);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_core_destroy_cq);
+
+int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+		       struct mlx5_query_cq_mbox_out *out)
+{
+	struct mlx5_query_cq_mbox_in in;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(out, 0, sizeof(*out));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_CQ);
+	in.cqn = cpu_to_be32(cq->cqn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out));
+	if (err)
+		return err;
+
+	if (out->hdr.status)
+		return mlx5_cmd_status_to_err(&out->hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_query_cq);
+
+
+int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			int type, struct mlx5_cq_modify_params *params)
+{
+	return -ENOSYS;
+}
+
+int mlx5_init_cq_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cq_table *table = &dev->priv.cq_table;
+	int err;
+
+	spin_lock_init(&table->lock);
+	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+	err = mlx5_cq_debugfs_init(dev);
+
+	return err;
+}
+
+void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev)
+{
+	mlx5_cq_debugfs_cleanup(dev);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
new file mode 100644
index 000000000000..5e9cf2b9aaf7
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c
@@ -0,0 +1,587 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/driver.h>
+#include "mlx5_core.h"
+
+enum {
+	QP_PID,
+	QP_STATE,
+	QP_XPORT,
+	QP_MTU,
+	QP_N_RECV,
+	QP_RECV_SZ,
+	QP_N_SEND,
+	QP_LOG_PG_SZ,
+	QP_RQPN,
+};
+
+static char *qp_fields[] = {
+	[QP_PID]	= "pid",
+	[QP_STATE]	= "state",
+	[QP_XPORT]	= "transport",
+	[QP_MTU]	= "mtu",
+	[QP_N_RECV]	= "num_recv",
+	[QP_RECV_SZ]	= "rcv_wqe_sz",
+	[QP_N_SEND]	= "num_send",
+	[QP_LOG_PG_SZ]	= "log2_page_sz",
+	[QP_RQPN]	= "remote_qpn",
+};
+
+enum {
+	EQ_NUM_EQES,
+	EQ_INTR,
+	EQ_LOG_PG_SZ,
+};
+
+static char *eq_fields[] = {
+	[EQ_NUM_EQES]	= "num_eqes",
+	[EQ_INTR]	= "intr",
+	[EQ_LOG_PG_SZ]	= "log_page_size",
+};
+
+enum {
+	CQ_PID,
+	CQ_NUM_CQES,
+	CQ_LOG_PG_SZ,
+};
+
+static char *cq_fields[] = {
+	[CQ_PID]	= "pid",
+	[CQ_NUM_CQES]	= "num_cqes",
+	[CQ_LOG_PG_SZ]	= "log_page_size",
+};
+
+struct dentry *mlx5_debugfs_root;
+EXPORT_SYMBOL(mlx5_debugfs_root);
+
+void mlx5_register_debugfs(void)
+{
+	mlx5_debugfs_root = debugfs_create_dir("mlx5", NULL);
+	if (IS_ERR_OR_NULL(mlx5_debugfs_root))
+		mlx5_debugfs_root = NULL;
+}
+
+void mlx5_unregister_debugfs(void)
+{
+	debugfs_remove(mlx5_debugfs_root);
+}
+
+int mlx5_qp_debugfs_init(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	atomic_set(&dev->num_qps, 0);
+
+	dev->priv.qp_debugfs = debugfs_create_dir("QPs",  dev->priv.dbg_root);
+	if (!dev->priv.qp_debugfs)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.qp_debugfs);
+}
+
+int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	dev->priv.eq_debugfs = debugfs_create_dir("EQs",  dev->priv.dbg_root);
+	if (!dev->priv.eq_debugfs)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.eq_debugfs);
+}
+
+static ssize_t average_read(struct file *filp, char __user *buf, size_t count,
+			    loff_t *pos)
+{
+	struct mlx5_cmd_stats *stats;
+	u64 field = 0;
+	int ret;
+	int err;
+	char tbuf[22];
+
+	if (*pos)
+		return 0;
+
+	stats = filp->private_data;
+	spin_lock(&stats->lock);
+	if (stats->n)
+		field = stats->sum / stats->n;
+	spin_unlock(&stats->lock);
+	ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field);
+	if (ret > 0) {
+		err = copy_to_user(buf, tbuf, ret);
+		if (err)
+			return err;
+	}
+
+	*pos += ret;
+	return ret;
+}
+
+
+static ssize_t average_write(struct file *filp, const char __user *buf,
+			     size_t count, loff_t *pos)
+{
+	struct mlx5_cmd_stats *stats;
+
+	stats = filp->private_data;
+	spin_lock(&stats->lock);
+	stats->sum = 0;
+	stats->n = 0;
+	spin_unlock(&stats->lock);
+
+	*pos += count;
+
+	return count;
+}
+
+static const struct file_operations stats_fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.read	= average_read,
+	.write	= average_write,
+};
+
+int mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_stats *stats;
+	struct dentry **cmd;
+	const char *namep;
+	int err;
+	int i;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	cmd = &dev->priv.cmdif_debugfs;
+	*cmd = debugfs_create_dir("commands", dev->priv.dbg_root);
+	if (!*cmd)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(dev->cmd.stats); i++) {
+		stats = &dev->cmd.stats[i];
+		namep = mlx5_command_str(i);
+		if (strcmp(namep, "unknown command opcode")) {
+			stats->root = debugfs_create_dir(namep, *cmd);
+			if (!stats->root) {
+				mlx5_core_warn(dev, "failed adding command %d\n",
+					       i);
+				err = -ENOMEM;
+				goto out;
+			}
+
+			stats->avg = debugfs_create_file("average", 0400,
+							 stats->root, stats,
+							 &stats_fops);
+			if (!stats->avg) {
+				mlx5_core_warn(dev, "failed creating debugfs file\n");
+				err = -ENOMEM;
+				goto out;
+			}
+
+			stats->count = debugfs_create_u64("n", 0400,
+							  stats->root,
+							  &stats->n);
+			if (!stats->count) {
+				mlx5_core_warn(dev, "failed creating debugfs file\n");
+				err = -ENOMEM;
+				goto out;
+			}
+		}
+	}
+
+	return 0;
+out:
+	debugfs_remove_recursive(dev->priv.cmdif_debugfs);
+	return err;
+}
+
+void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.cmdif_debugfs);
+}
+
+int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	dev->priv.cq_debugfs = debugfs_create_dir("CQs",  dev->priv.dbg_root);
+	if (!dev->priv.cq_debugfs)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	debugfs_remove_recursive(dev->priv.cq_debugfs);
+}
+
+static u64 qp_read_field(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp,
+			 int index)
+{
+	struct mlx5_query_qp_mbox_out *out;
+	struct mlx5_qp_context *ctx;
+	u64 param = 0;
+	int err;
+	int no_sq;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return param;
+
+	err = mlx5_core_qp_query(dev, qp, out, sizeof(*out));
+	if (err) {
+		mlx5_core_warn(dev, "failed to query qp\n");
+		goto out;
+	}
+
+	ctx = &out->ctx;
+	switch (index) {
+	case QP_PID:
+		param = qp->pid;
+		break;
+	case QP_STATE:
+		param = be32_to_cpu(ctx->flags) >> 28;
+		break;
+	case QP_XPORT:
+		param = (be32_to_cpu(ctx->flags) >> 16) & 0xff;
+		break;
+	case QP_MTU:
+		param = ctx->mtu_msgmax >> 5;
+		break;
+	case QP_N_RECV:
+		param = 1 << ((ctx->rq_size_stride >> 3) & 0xf);
+		break;
+	case QP_RECV_SZ:
+		param = 1 << ((ctx->rq_size_stride & 7) + 4);
+		break;
+	case QP_N_SEND:
+		no_sq = be16_to_cpu(ctx->sq_crq_size) >> 15;
+		if (!no_sq)
+			param = 1 << (be16_to_cpu(ctx->sq_crq_size) >> 11);
+		else
+			param = 0;
+		break;
+	case QP_LOG_PG_SZ:
+		param = ((cpu_to_be32(ctx->log_pg_sz_remote_qpn) >> 24) & 0x1f);
+		param += 12;
+		break;
+	case QP_RQPN:
+		param = cpu_to_be32(ctx->log_pg_sz_remote_qpn) & 0xffffff;
+		break;
+	}
+
+out:
+	kfree(out);
+	return param;
+}
+
+static u64 eq_read_field(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+			 int index)
+{
+	struct mlx5_query_eq_mbox_out *out;
+	struct mlx5_eq_context *ctx;
+	u64 param = 0;
+	int err;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return param;
+
+	ctx = &out->ctx;
+
+	err = mlx5_core_eq_query(dev, eq, out, sizeof(*out));
+	if (err) {
+		mlx5_core_warn(dev, "failed to query eq\n");
+		goto out;
+	}
+
+	switch (index) {
+	case EQ_NUM_EQES:
+		param = 1 << ((be32_to_cpu(ctx->log_sz_usr_page) >> 24) & 0x1f);
+		break;
+	case EQ_INTR:
+		param = ctx->intr;
+		break;
+	case EQ_LOG_PG_SZ:
+		param = (ctx->log_page_size & 0x1f) + 12;
+		break;
+	}
+
+out:
+	kfree(out);
+	return param;
+}
+
+static u64 cq_read_field(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			 int index)
+{
+	struct mlx5_query_cq_mbox_out *out;
+	struct mlx5_cq_context *ctx;
+	u64 param = 0;
+	int err;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return param;
+
+	ctx = &out->ctx;
+
+	err = mlx5_core_query_cq(dev, cq, out);
+	if (err) {
+		mlx5_core_warn(dev, "failed to query cq\n");
+		goto out;
+	}
+
+	switch (index) {
+	case CQ_PID:
+		param = cq->pid;
+		break;
+	case CQ_NUM_CQES:
+		param = 1 << ((be32_to_cpu(ctx->log_sz_usr_page) >> 24) & 0x1f);
+		break;
+	case CQ_LOG_PG_SZ:
+		param = (ctx->log_pg_sz & 0x1f) + 12;
+		break;
+	}
+
+out:
+	kfree(out);
+	return param;
+}
+
+static ssize_t dbg_read(struct file *filp, char __user *buf, size_t count,
+			loff_t *pos)
+{
+	struct mlx5_field_desc *desc;
+	struct mlx5_rsc_debug *d;
+	char tbuf[18];
+	u64 field;
+	int ret;
+	int err;
+
+	if (*pos)
+		return 0;
+
+	desc = filp->private_data;
+	d = (void *)(desc - desc->i) - sizeof(*d);
+	switch (d->type) {
+	case MLX5_DBG_RSC_QP:
+		field = qp_read_field(d->dev, d->object, desc->i);
+		break;
+
+	case MLX5_DBG_RSC_EQ:
+		field = eq_read_field(d->dev, d->object, desc->i);
+		break;
+
+	case MLX5_DBG_RSC_CQ:
+		field = cq_read_field(d->dev, d->object, desc->i);
+		break;
+
+	default:
+		mlx5_core_warn(d->dev, "invalid resource type %d\n", d->type);
+		return -EINVAL;
+	}
+
+	ret = snprintf(tbuf, sizeof(tbuf), "0x%llx\n", field);
+	if (ret > 0) {
+		err = copy_to_user(buf, tbuf, ret);
+		if (err)
+			return err;
+	}
+
+	*pos += ret;
+	return ret;
+}
+
+static const struct file_operations fops = {
+	.owner	= THIS_MODULE,
+	.open	= simple_open,
+	.read	= dbg_read,
+};
+
+static int add_res_tree(struct mlx5_core_dev *dev, enum dbg_rsc_type type,
+			struct dentry *root, struct mlx5_rsc_debug **dbg,
+			int rsn, char **field, int nfile, void *data)
+{
+	struct mlx5_rsc_debug *d;
+	char resn[32];
+	int err;
+	int i;
+
+	d = kzalloc(sizeof(*d) + nfile * sizeof(d->fields[0]), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	d->dev = dev;
+	d->object = data;
+	d->type = type;
+	sprintf(resn, "0x%x", rsn);
+	d->root = debugfs_create_dir(resn,  root);
+	if (!d->root) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	for (i = 0; i < nfile; i++) {
+		d->fields[i].i = i;
+		d->fields[i].dent = debugfs_create_file(field[i], 0400,
+							d->root, &d->fields[i],
+							&fops);
+		if (!d->fields[i].dent) {
+			err = -ENOMEM;
+			goto out_rem;
+		}
+	}
+	*dbg = d;
+
+	return 0;
+out_rem:
+	debugfs_remove_recursive(d->root);
+
+out_free:
+	kfree(d);
+	return err;
+}
+
+static void rem_res_tree(struct mlx5_rsc_debug *d)
+{
+	debugfs_remove_recursive(d->root);
+	kfree(d);
+}
+
+int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp)
+{
+	int err;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	err = add_res_tree(dev, MLX5_DBG_RSC_QP, dev->priv.qp_debugfs,
+			   &qp->dbg, qp->qpn, qp_fields,
+			   ARRAY_SIZE(qp_fields), qp);
+	if (err)
+		qp->dbg = NULL;
+
+	return err;
+}
+
+void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	if (qp->dbg)
+		rem_res_tree(qp->dbg);
+}
+
+
+int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+{
+	int err;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	err = add_res_tree(dev, MLX5_DBG_RSC_EQ, dev->priv.eq_debugfs,
+			   &eq->dbg, eq->eqn, eq_fields,
+			   ARRAY_SIZE(eq_fields), eq);
+	if (err)
+		eq->dbg = NULL;
+
+	return err;
+}
+
+void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	if (eq->dbg)
+		rem_res_tree(eq->dbg);
+}
+
+int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
+{
+	int err;
+
+	if (!mlx5_debugfs_root)
+		return 0;
+
+	err = add_res_tree(dev, MLX5_DBG_RSC_CQ, dev->priv.cq_debugfs,
+			   &cq->dbg, cq->cqn, cq_fields,
+			   ARRAY_SIZE(cq_fields), cq);
+	if (err)
+		cq->dbg = NULL;
+
+	return err;
+}
+
+void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq)
+{
+	if (!mlx5_debugfs_root)
+		return;
+
+	if (cq->dbg)
+		rem_res_tree(cq->dbg);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
new file mode 100644
index 000000000000..c02cbcfd0fb8
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -0,0 +1,521 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+enum {
+	MLX5_EQE_SIZE		= sizeof(struct mlx5_eqe),
+	MLX5_EQE_OWNER_INIT_VAL	= 0x1,
+};
+
+enum {
+	MLX5_EQ_STATE_ARMED		= 0x9,
+	MLX5_EQ_STATE_FIRED		= 0xa,
+	MLX5_EQ_STATE_ALWAYS_ARMED	= 0xb,
+};
+
+enum {
+	MLX5_NUM_SPARE_EQE	= 0x80,
+	MLX5_NUM_ASYNC_EQE	= 0x100,
+	MLX5_NUM_CMD_EQE	= 32,
+};
+
+enum {
+	MLX5_EQ_DOORBEL_OFFSET	= 0x40,
+};
+
+#define MLX5_ASYNC_EVENT_MASK ((1ull << MLX5_EVENT_TYPE_PATH_MIG)	    | \
+			       (1ull << MLX5_EVENT_TYPE_COMM_EST)	    | \
+			       (1ull << MLX5_EVENT_TYPE_SQ_DRAINED)	    | \
+			       (1ull << MLX5_EVENT_TYPE_CQ_ERROR)	    | \
+			       (1ull << MLX5_EVENT_TYPE_WQ_CATAS_ERROR)	    | \
+			       (1ull << MLX5_EVENT_TYPE_PATH_MIG_FAILED)    | \
+			       (1ull << MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+			       (1ull << MLX5_EVENT_TYPE_WQ_ACCESS_ERROR)    | \
+			       (1ull << MLX5_EVENT_TYPE_PORT_CHANGE)	    | \
+			       (1ull << MLX5_EVENT_TYPE_SRQ_CATAS_ERROR)    | \
+			       (1ull << MLX5_EVENT_TYPE_SRQ_LAST_WQE)	    | \
+			       (1ull << MLX5_EVENT_TYPE_SRQ_RQ_LIMIT))
+
+struct map_eq_in {
+	u64	mask;
+	u32	reserved;
+	u32	unmap_eqn;
+};
+
+struct cre_des_eq {
+	u8	reserved[15];
+	u8	eqn;
+};
+
+static int mlx5_cmd_destroy_eq(struct mlx5_core_dev *dev, u8 eqn)
+{
+	struct mlx5_destroy_eq_mbox_in in;
+	struct mlx5_destroy_eq_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_EQ);
+	in.eqn = eqn;
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (!err)
+		goto ex;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+ex:
+	return err;
+}
+
+static struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, u32 entry)
+{
+	return mlx5_buf_offset(&eq->buf, entry * MLX5_EQE_SIZE);
+}
+
+static struct mlx5_eqe *next_eqe_sw(struct mlx5_eq *eq)
+{
+	struct mlx5_eqe *eqe = get_eqe(eq, eq->cons_index & (eq->nent - 1));
+
+	return ((eqe->owner & 1) ^ !!(eq->cons_index & eq->nent)) ? NULL : eqe;
+}
+
+static const char *eqe_type_str(u8 type)
+{
+	switch (type) {
+	case MLX5_EVENT_TYPE_COMP:
+		return "MLX5_EVENT_TYPE_COMP";
+	case MLX5_EVENT_TYPE_PATH_MIG:
+		return "MLX5_EVENT_TYPE_PATH_MIG";
+	case MLX5_EVENT_TYPE_COMM_EST:
+		return "MLX5_EVENT_TYPE_COMM_EST";
+	case MLX5_EVENT_TYPE_SQ_DRAINED:
+		return "MLX5_EVENT_TYPE_SQ_DRAINED";
+	case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+		return "MLX5_EVENT_TYPE_SRQ_LAST_WQE";
+	case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+		return "MLX5_EVENT_TYPE_SRQ_RQ_LIMIT";
+	case MLX5_EVENT_TYPE_CQ_ERROR:
+		return "MLX5_EVENT_TYPE_CQ_ERROR";
+	case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+		return "MLX5_EVENT_TYPE_WQ_CATAS_ERROR";
+	case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+		return "MLX5_EVENT_TYPE_PATH_MIG_FAILED";
+	case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		return "MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR";
+	case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+		return "MLX5_EVENT_TYPE_WQ_ACCESS_ERROR";
+	case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+		return "MLX5_EVENT_TYPE_SRQ_CATAS_ERROR";
+	case MLX5_EVENT_TYPE_INTERNAL_ERROR:
+		return "MLX5_EVENT_TYPE_INTERNAL_ERROR";
+	case MLX5_EVENT_TYPE_PORT_CHANGE:
+		return "MLX5_EVENT_TYPE_PORT_CHANGE";
+	case MLX5_EVENT_TYPE_GPIO_EVENT:
+		return "MLX5_EVENT_TYPE_GPIO_EVENT";
+	case MLX5_EVENT_TYPE_REMOTE_CONFIG:
+		return "MLX5_EVENT_TYPE_REMOTE_CONFIG";
+	case MLX5_EVENT_TYPE_DB_BF_CONGESTION:
+		return "MLX5_EVENT_TYPE_DB_BF_CONGESTION";
+	case MLX5_EVENT_TYPE_STALL_EVENT:
+		return "MLX5_EVENT_TYPE_STALL_EVENT";
+	case MLX5_EVENT_TYPE_CMD:
+		return "MLX5_EVENT_TYPE_CMD";
+	case MLX5_EVENT_TYPE_PAGE_REQUEST:
+		return "MLX5_EVENT_TYPE_PAGE_REQUEST";
+	default:
+		return "Unrecognized event";
+	}
+}
+
+static enum mlx5_dev_event port_subtype_event(u8 subtype)
+{
+	switch (subtype) {
+	case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+		return MLX5_DEV_EVENT_PORT_DOWN;
+	case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+		return MLX5_DEV_EVENT_PORT_UP;
+	case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
+		return MLX5_DEV_EVENT_PORT_INITIALIZED;
+	case MLX5_PORT_CHANGE_SUBTYPE_LID:
+		return MLX5_DEV_EVENT_LID_CHANGE;
+	case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
+		return MLX5_DEV_EVENT_PKEY_CHANGE;
+	case MLX5_PORT_CHANGE_SUBTYPE_GUID:
+		return MLX5_DEV_EVENT_GUID_CHANGE;
+	case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
+		return MLX5_DEV_EVENT_CLIENT_REREG;
+	}
+	return -1;
+}
+
+static void eq_update_ci(struct mlx5_eq *eq, int arm)
+{
+	__be32 __iomem *addr = eq->doorbell + (arm ? 0 : 2);
+	u32 val = (eq->cons_index & 0xffffff) | (eq->eqn << 24);
+	__raw_writel((__force u32) cpu_to_be32(val), addr);
+	/* We still want ordering, just not swabbing, so add a barrier */
+	mb();
+}
+
+static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+{
+	struct mlx5_eqe *eqe;
+	int eqes_found = 0;
+	int set_ci = 0;
+	u32 cqn;
+	u32 srqn;
+	u8 port;
+
+	while ((eqe = next_eqe_sw(eq))) {
+		/*
+		 * Make sure we read EQ entry contents after we've
+		 * checked the ownership bit.
+		 */
+		rmb();
+
+		mlx5_core_dbg(eq->dev, "eqn %d, eqe type %s\n", eq->eqn, eqe_type_str(eqe->type));
+		switch (eqe->type) {
+		case MLX5_EVENT_TYPE_COMP:
+			cqn = be32_to_cpu(eqe->data.comp.cqn) & 0xffffff;
+			mlx5_cq_completion(dev, cqn);
+			break;
+
+		case MLX5_EVENT_TYPE_PATH_MIG:
+		case MLX5_EVENT_TYPE_COMM_EST:
+		case MLX5_EVENT_TYPE_SQ_DRAINED:
+		case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+		case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+		case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+		case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+		case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+			mlx5_core_dbg(dev, "event %s(%d) arrived\n",
+				      eqe_type_str(eqe->type), eqe->type);
+			mlx5_qp_event(dev, be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff,
+				      eqe->type);
+			break;
+
+		case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+		case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+			srqn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
+			mlx5_core_dbg(dev, "SRQ event %s(%d): srqn 0x%x\n",
+				      eqe_type_str(eqe->type), eqe->type, srqn);
+			mlx5_srq_event(dev, srqn, eqe->type);
+			break;
+
+		case MLX5_EVENT_TYPE_CMD:
+			mlx5_cmd_comp_handler(dev, be32_to_cpu(eqe->data.cmd.vector));
+			break;
+
+		case MLX5_EVENT_TYPE_PORT_CHANGE:
+			port = (eqe->data.port.port >> 4) & 0xf;
+			switch (eqe->sub_type) {
+			case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
+			case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
+			case MLX5_PORT_CHANGE_SUBTYPE_LID:
+			case MLX5_PORT_CHANGE_SUBTYPE_PKEY:
+			case MLX5_PORT_CHANGE_SUBTYPE_GUID:
+			case MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG:
+			case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED:
+				dev->event(dev, port_subtype_event(eqe->sub_type), &port);
+				break;
+			default:
+				mlx5_core_warn(dev, "Port event with unrecognized subtype: port %d, sub_type %d\n",
+					       port, eqe->sub_type);
+			}
+			break;
+		case MLX5_EVENT_TYPE_CQ_ERROR:
+			cqn = be32_to_cpu(eqe->data.cq_err.cqn) & 0xffffff;
+			mlx5_core_warn(dev, "CQ error on CQN 0x%x, syndrom 0x%x\n",
+				       cqn, eqe->data.cq_err.syndrome);
+			mlx5_cq_event(dev, cqn, eqe->type);
+			break;
+
+		case MLX5_EVENT_TYPE_PAGE_REQUEST:
+			{
+				u16 func_id = be16_to_cpu(eqe->data.req_pages.func_id);
+				s16 npages = be16_to_cpu(eqe->data.req_pages.num_pages);
+
+				mlx5_core_dbg(dev, "page request for func 0x%x, napges %d\n", func_id, npages);
+				mlx5_core_req_pages_handler(dev, func_id, npages);
+			}
+			break;
+
+
+		default:
+			mlx5_core_warn(dev, "Unhandled event 0x%x on EQ 0x%x\n", eqe->type, eq->eqn);
+			break;
+		}
+
+		++eq->cons_index;
+		eqes_found = 1;
+		++set_ci;
+
+		/* The HCA will think the queue has overflowed if we
+		 * don't tell it we've been processing events.  We
+		 * create our EQs with MLX5_NUM_SPARE_EQE extra
+		 * entries, so we must update our consumer index at
+		 * least that often.
+		 */
+		if (unlikely(set_ci >= MLX5_NUM_SPARE_EQE)) {
+			eq_update_ci(eq, 0);
+			set_ci = 0;
+		}
+	}
+
+	eq_update_ci(eq, 1);
+
+	return eqes_found;
+}
+
+static irqreturn_t mlx5_msix_handler(int irq, void *eq_ptr)
+{
+	struct mlx5_eq *eq = eq_ptr;
+	struct mlx5_core_dev *dev = eq->dev;
+
+	mlx5_eq_int(dev, eq);
+
+	/* MSI-X vectors always belong to us */
+	return IRQ_HANDLED;
+}
+
+static void init_eq_buf(struct mlx5_eq *eq)
+{
+	struct mlx5_eqe *eqe;
+	int i;
+
+	for (i = 0; i < eq->nent; i++) {
+		eqe = get_eqe(eq, i);
+		eqe->owner = MLX5_EQE_OWNER_INIT_VAL;
+	}
+}
+
+int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
+		       int nent, u64 mask, const char *name, struct mlx5_uar *uar)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	struct mlx5_create_eq_mbox_in *in;
+	struct mlx5_create_eq_mbox_out out;
+	int err;
+	int inlen;
+
+	eq->nent = roundup_pow_of_two(nent + MLX5_NUM_SPARE_EQE);
+	err = mlx5_buf_alloc(dev, eq->nent * MLX5_EQE_SIZE, 2 * PAGE_SIZE,
+			     &eq->buf);
+	if (err)
+		return err;
+
+	init_eq_buf(eq);
+
+	inlen = sizeof(*in) + sizeof(in->pas[0]) * eq->buf.npages;
+	in = mlx5_vzalloc(inlen);
+	if (!in) {
+		err = -ENOMEM;
+		goto err_buf;
+	}
+	memset(&out, 0, sizeof(out));
+
+	mlx5_fill_page_array(&eq->buf, in->pas);
+
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_EQ);
+	in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(eq->nent) << 24 | uar->index);
+	in->ctx.intr = vecidx;
+	in->ctx.log_page_size = PAGE_SHIFT - 12;
+	in->events_mask = cpu_to_be64(mask);
+
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (err)
+		goto err_in;
+
+	if (out.hdr.status) {
+		err = mlx5_cmd_status_to_err(&out.hdr);
+		goto err_in;
+	}
+
+	eq->eqn = out.eq_number;
+	err = request_irq(table->msix_arr[vecidx].vector, mlx5_msix_handler, 0,
+			  name, eq);
+	if (err)
+		goto err_eq;
+
+	eq->irqn = vecidx;
+	eq->dev = dev;
+	eq->doorbell = uar->map + MLX5_EQ_DOORBEL_OFFSET;
+
+	err = mlx5_debug_eq_add(dev, eq);
+	if (err)
+		goto err_irq;
+
+	/* EQs are created in ARMED state
+	 */
+	eq_update_ci(eq, 1);
+
+	mlx5_vfree(in);
+	return 0;
+
+err_irq:
+	free_irq(table->msix_arr[vecidx].vector, eq);
+
+err_eq:
+	mlx5_cmd_destroy_eq(dev, eq->eqn);
+
+err_in:
+	mlx5_vfree(in);
+
+err_buf:
+	mlx5_buf_free(dev, &eq->buf);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_create_map_eq);
+
+int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	int err;
+
+	mlx5_debug_eq_remove(dev, eq);
+	free_irq(table->msix_arr[eq->irqn].vector, eq);
+	err = mlx5_cmd_destroy_eq(dev, eq->eqn);
+	if (err)
+		mlx5_core_warn(dev, "failed to destroy a previously created eq: eqn %d\n",
+			       eq->eqn);
+	mlx5_buf_free(dev, &eq->buf);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_destroy_unmap_eq);
+
+int mlx5_eq_init(struct mlx5_core_dev *dev)
+{
+	int err;
+
+	spin_lock_init(&dev->priv.eq_table.lock);
+
+	err = mlx5_eq_debugfs_init(dev);
+
+	return err;
+}
+
+
+void mlx5_eq_cleanup(struct mlx5_core_dev *dev)
+{
+	mlx5_eq_debugfs_cleanup(dev);
+}
+
+int mlx5_start_eqs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	int err;
+
+	err = mlx5_create_map_eq(dev, &table->cmd_eq, MLX5_EQ_VEC_CMD,
+				 MLX5_NUM_CMD_EQE, 1ull << MLX5_EVENT_TYPE_CMD,
+				 "mlx5_cmd_eq", &dev->priv.uuari.uars[0]);
+	if (err) {
+		mlx5_core_warn(dev, "failed to create cmd EQ %d\n", err);
+		return err;
+	}
+
+	mlx5_cmd_use_events(dev);
+
+	err = mlx5_create_map_eq(dev, &table->async_eq, MLX5_EQ_VEC_ASYNC,
+				 MLX5_NUM_ASYNC_EQE, MLX5_ASYNC_EVENT_MASK,
+				 "mlx5_async_eq", &dev->priv.uuari.uars[0]);
+	if (err) {
+		mlx5_core_warn(dev, "failed to create async EQ %d\n", err);
+		goto err1;
+	}
+
+	err = mlx5_create_map_eq(dev, &table->pages_eq,
+				 MLX5_EQ_VEC_PAGES,
+				 dev->caps.max_vf + 1,
+				 1 << MLX5_EVENT_TYPE_PAGE_REQUEST, "mlx5_pages_eq",
+				 &dev->priv.uuari.uars[0]);
+	if (err) {
+		mlx5_core_warn(dev, "failed to create pages EQ %d\n", err);
+		goto err2;
+	}
+
+	return err;
+
+err2:
+	mlx5_destroy_unmap_eq(dev, &table->async_eq);
+
+err1:
+	mlx5_cmd_use_polling(dev);
+	mlx5_destroy_unmap_eq(dev, &table->cmd_eq);
+	return err;
+}
+
+int mlx5_stop_eqs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	int err;
+
+	err = mlx5_destroy_unmap_eq(dev, &table->pages_eq);
+	if (err)
+		return err;
+
+	mlx5_destroy_unmap_eq(dev, &table->async_eq);
+	mlx5_cmd_use_polling(dev);
+
+	err = mlx5_destroy_unmap_eq(dev, &table->cmd_eq);
+	if (err)
+		mlx5_cmd_use_events(dev);
+
+	return err;
+}
+
+int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		       struct mlx5_query_eq_mbox_out *out, int outlen)
+{
+	struct mlx5_query_eq_mbox_in in;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(out, 0, outlen);
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_EQ);
+	in.eqn = eq->eqn;
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen);
+	if (err)
+		return err;
+
+	if (out->hdr.status)
+		err = mlx5_cmd_status_to_err(&out->hdr);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_eq_query);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw.c b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
new file mode 100644
index 000000000000..72a5222447f5
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <linux/module.h>
+#include "mlx5_core.h"
+
+int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_query_adapter_mbox_out *out;
+	struct mlx5_cmd_query_adapter_mbox_in in;
+	int err;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	memset(&in, 0, sizeof(in));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_ADAPTER);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out));
+	if (err)
+		goto out_out;
+
+	if (out->hdr.status) {
+		err = mlx5_cmd_status_to_err(&out->hdr);
+		goto out_out;
+	}
+
+	memcpy(dev->board_id, out->vsd_psid, sizeof(out->vsd_psid));
+
+out_out:
+	kfree(out);
+
+	return err;
+}
+
+int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev,
+			   struct mlx5_caps *caps)
+{
+	struct mlx5_cmd_query_hca_cap_mbox_out *out;
+	struct mlx5_cmd_query_hca_cap_mbox_in in;
+	struct mlx5_query_special_ctxs_mbox_out ctx_out;
+	struct mlx5_query_special_ctxs_mbox_in ctx_in;
+	int err;
+	u16 t16;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out)
+		return -ENOMEM;
+
+	memset(&in, 0, sizeof(in));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP);
+	in.hdr.opmod  = cpu_to_be16(0x1);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out));
+	if (err)
+		goto out_out;
+
+	if (out->hdr.status) {
+		err = mlx5_cmd_status_to_err(&out->hdr);
+		goto out_out;
+	}
+
+
+	caps->log_max_eq = out->hca_cap.log_max_eq & 0xf;
+	caps->max_cqes = 1 << out->hca_cap.log_max_cq_sz;
+	caps->max_wqes = 1 << out->hca_cap.log_max_qp_sz;
+	caps->max_sq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_sq);
+	caps->max_rq_desc_sz = be16_to_cpu(out->hca_cap.max_desc_sz_rq);
+	caps->flags = be64_to_cpu(out->hca_cap.flags);
+	caps->stat_rate_support = be16_to_cpu(out->hca_cap.stat_rate_support);
+	caps->log_max_msg = out->hca_cap.log_max_msg & 0x1f;
+	caps->num_ports = out->hca_cap.num_ports & 0xf;
+	caps->log_max_cq = out->hca_cap.log_max_cq & 0x1f;
+	if (caps->num_ports > MLX5_MAX_PORTS) {
+		mlx5_core_err(dev, "device has %d ports while the driver supports max %d ports\n",
+			      caps->num_ports, MLX5_MAX_PORTS);
+		err = -EINVAL;
+		goto out_out;
+	}
+	caps->log_max_qp = out->hca_cap.log_max_qp & 0x1f;
+	caps->log_max_mkey = out->hca_cap.log_max_mkey & 0x3f;
+	caps->log_max_pd = out->hca_cap.log_max_pd & 0x1f;
+	caps->log_max_srq = out->hca_cap.log_max_srqs & 0x1f;
+	caps->local_ca_ack_delay = out->hca_cap.local_ca_ack_delay & 0x1f;
+	caps->log_max_mcg = out->hca_cap.log_max_mcg;
+	caps->max_qp_mcg = be16_to_cpu(out->hca_cap.max_qp_mcg);
+	caps->max_ra_res_qp = 1 << (out->hca_cap.log_max_ra_res_qp & 0x3f);
+	caps->max_ra_req_qp = 1 << (out->hca_cap.log_max_ra_req_qp & 0x3f);
+	caps->max_srq_wqes = 1 << out->hca_cap.log_max_srq_sz;
+	t16 = be16_to_cpu(out->hca_cap.bf_log_bf_reg_size);
+	if (t16 & 0x8000) {
+		caps->bf_reg_size = 1 << (t16 & 0x1f);
+		caps->bf_regs_per_page = MLX5_BF_REGS_PER_PAGE;
+	} else {
+		caps->bf_reg_size = 0;
+		caps->bf_regs_per_page = 0;
+	}
+	caps->min_page_sz = ~(u32)((1 << out->hca_cap.log_pg_sz) - 1);
+
+	memset(&ctx_in, 0, sizeof(ctx_in));
+	memset(&ctx_out, 0, sizeof(ctx_out));
+	ctx_in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS);
+	err = mlx5_cmd_exec(dev, &ctx_in, sizeof(ctx_in),
+				 &ctx_out, sizeof(ctx_out));
+	if (err)
+		goto out_out;
+
+	if (ctx_out.hdr.status)
+		err = mlx5_cmd_status_to_err(&ctx_out.hdr);
+
+	caps->reserved_lkey = be32_to_cpu(ctx_out.reserved_lkey);
+
+out_out:
+	kfree(out);
+
+	return err;
+}
+
+int mlx5_cmd_init_hca(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_init_hca_mbox_in in;
+	struct mlx5_cmd_init_hca_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_INIT_HCA);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+
+int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_teardown_hca_mbox_in in;
+	struct mlx5_cmd_teardown_hca_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_TEARDOWN_HCA);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c
new file mode 100644
index 000000000000..ea4b9bca6d4a
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/vmalloc.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+enum {
+	MLX5_HEALTH_POLL_INTERVAL	= 2 * HZ,
+	MAX_MISSES			= 3,
+};
+
+enum {
+	MLX5_HEALTH_SYNDR_FW_ERR		= 0x1,
+	MLX5_HEALTH_SYNDR_IRISC_ERR		= 0x7,
+	MLX5_HEALTH_SYNDR_CRC_ERR		= 0x9,
+	MLX5_HEALTH_SYNDR_FETCH_PCI_ERR		= 0xa,
+	MLX5_HEALTH_SYNDR_HW_FTL_ERR		= 0xb,
+	MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR	= 0xc,
+	MLX5_HEALTH_SYNDR_EQ_ERR		= 0xd,
+	MLX5_HEALTH_SYNDR_FFSER_ERR		= 0xf,
+};
+
+static DEFINE_SPINLOCK(health_lock);
+
+static LIST_HEAD(health_list);
+static struct work_struct health_work;
+
+static health_handler_t reg_handler;
+int mlx5_register_health_report_handler(health_handler_t handler)
+{
+	spin_lock_irq(&health_lock);
+	if (reg_handler) {
+		spin_unlock_irq(&health_lock);
+		return -EEXIST;
+	}
+	reg_handler = handler;
+	spin_unlock_irq(&health_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_register_health_report_handler);
+
+void mlx5_unregister_health_report_handler(void)
+{
+	spin_lock_irq(&health_lock);
+	reg_handler = NULL;
+	spin_unlock_irq(&health_lock);
+}
+EXPORT_SYMBOL(mlx5_unregister_health_report_handler);
+
+static void health_care(struct work_struct *work)
+{
+	struct mlx5_core_health *health, *n;
+	struct mlx5_core_dev *dev;
+	struct mlx5_priv *priv;
+	LIST_HEAD(tlist);
+
+	spin_lock_irq(&health_lock);
+	list_splice_init(&health_list, &tlist);
+
+	spin_unlock_irq(&health_lock);
+
+	list_for_each_entry_safe(health, n, &tlist, list) {
+		priv = container_of(health, struct mlx5_priv, health);
+		dev = container_of(priv, struct mlx5_core_dev, priv);
+		mlx5_core_warn(dev, "handling bad device here\n");
+		spin_lock_irq(&health_lock);
+		if (reg_handler)
+			reg_handler(dev->pdev, health->health,
+				    sizeof(health->health));
+
+		list_del_init(&health->list);
+		spin_unlock_irq(&health_lock);
+	}
+}
+
+static const char *hsynd_str(u8 synd)
+{
+	switch (synd) {
+	case MLX5_HEALTH_SYNDR_FW_ERR:
+		return "firmware internal error";
+	case MLX5_HEALTH_SYNDR_IRISC_ERR:
+		return "irisc not responding";
+	case MLX5_HEALTH_SYNDR_CRC_ERR:
+		return "firmware CRC error";
+	case MLX5_HEALTH_SYNDR_FETCH_PCI_ERR:
+		return "ICM fetch PCI error";
+	case MLX5_HEALTH_SYNDR_HW_FTL_ERR:
+		return "HW fatal error\n";
+	case MLX5_HEALTH_SYNDR_ASYNC_EQ_OVERRUN_ERR:
+		return "async EQ buffer overrun";
+	case MLX5_HEALTH_SYNDR_EQ_ERR:
+		return "EQ error";
+	case MLX5_HEALTH_SYNDR_FFSER_ERR:
+		return "FFSER error";
+	default:
+		return "unrecognized error";
+	}
+}
+
+static void print_health_info(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct health_buffer __iomem *h = health->health;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
+		pr_info("assert_var[%d] 0x%08x\n", i, be32_to_cpu(h->assert_var[i]));
+
+	pr_info("assert_exit_ptr 0x%08x\n", be32_to_cpu(h->assert_exit_ptr));
+	pr_info("assert_callra 0x%08x\n", be32_to_cpu(h->assert_callra));
+	pr_info("fw_ver 0x%08x\n", be32_to_cpu(h->fw_ver));
+	pr_info("hw_id 0x%08x\n", be32_to_cpu(h->hw_id));
+	pr_info("irisc_index %d\n", h->irisc_index);
+	pr_info("synd 0x%x: %s\n", h->synd, hsynd_str(h->synd));
+	pr_info("ext_sync 0x%04x\n", be16_to_cpu(h->ext_sync));
+}
+
+static void poll_health(unsigned long data)
+{
+	struct mlx5_core_dev *dev = (struct mlx5_core_dev *)data;
+	struct mlx5_core_health *health = &dev->priv.health;
+	unsigned long next;
+	u32 count;
+
+	count = ioread32be(health->health_counter);
+	if (count == health->prev)
+		++health->miss_counter;
+	else
+		health->miss_counter = 0;
+
+	health->prev = count;
+	if (health->miss_counter == MAX_MISSES) {
+		mlx5_core_err(dev, "device's health compromised\n");
+		print_health_info(dev);
+		spin_lock_irq(&health_lock);
+		list_add_tail(&health->list, &health_list);
+		spin_unlock_irq(&health_lock);
+
+		queue_work(mlx5_core_wq, &health_work);
+	} else {
+		get_random_bytes(&next, sizeof(next));
+		next %= HZ;
+		next += jiffies + MLX5_HEALTH_POLL_INTERVAL;
+		mod_timer(&health->timer, next);
+	}
+}
+
+void mlx5_start_health_poll(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	INIT_LIST_HEAD(&health->list);
+	init_timer(&health->timer);
+	health->health = &dev->iseg->health;
+	health->health_counter = &dev->iseg->health_counter;
+
+	health->timer.data = (unsigned long)dev;
+	health->timer.function = poll_health;
+	health->timer.expires = round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL);
+	add_timer(&health->timer);
+}
+
+void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+
+	del_timer_sync(&health->timer);
+
+	spin_lock_irq(&health_lock);
+	if (!list_empty(&health->list))
+		list_del_init(&health->list);
+	spin_unlock_irq(&health_lock);
+}
+
+void mlx5_health_cleanup(void)
+{
+}
+
+void  __init mlx5_health_init(void)
+{
+	INIT_WORK(&health_work, health_care);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mad.c b/drivers/net/ethernet/mellanox/mlx5/core/mad.c
new file mode 100644
index 000000000000..18d6fd5dd90b
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mad.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, void *inb, void *outb,
+		      u16 opmod, int port)
+{
+	struct mlx5_mad_ifc_mbox_in *in = NULL;
+	struct mlx5_mad_ifc_mbox_out *out = NULL;
+	int err;
+
+	in = kzalloc(sizeof(*in), GFP_KERNEL);
+	if (!in)
+		return -ENOMEM;
+
+	out = kzalloc(sizeof(*out), GFP_KERNEL);
+	if (!out) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MAD_IFC);
+	in->hdr.opmod = cpu_to_be16(opmod);
+	in->port = port;
+
+	memcpy(in->data, inb, sizeof(in->data));
+
+	err = mlx5_cmd_exec(dev, in, sizeof(*in), out, sizeof(*out));
+	if (err)
+		goto out;
+
+	if (out->hdr.status) {
+		err = mlx5_cmd_status_to_err(&out->hdr);
+		goto out;
+	}
+
+	memcpy(outb, out->data, sizeof(out->data));
+
+out:
+	kfree(out);
+	kfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_mad_ifc);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
new file mode 100644
index 000000000000..f21cc397d1bc
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm-generic/kmap_types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/io-mapping.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/srq.h>
+#include <linux/debugfs.h>
+#include "mlx5_core.h"
+
+#define DRIVER_NAME "mlx5_core"
+#define DRIVER_VERSION "1.0"
+#define DRIVER_RELDATE	"June 2013"
+
+MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox ConnectX-IB HCA core library");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRIVER_VERSION);
+
+int mlx5_core_debug_mask;
+module_param_named(debug_mask, mlx5_core_debug_mask, int, 0644);
+MODULE_PARM_DESC(debug_mask, "debug mask: 1 = dump cmd data, 2 = dump cmd exec time, 3 = both. Default=0");
+
+struct workqueue_struct *mlx5_core_wq;
+
+static int set_dma_caps(struct pci_dev *pdev)
+{
+	int err;
+
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
+			return err;
+		}
+	}
+
+	err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev,
+			 "Warning: couldn't set 64-bit consistent PCI DMA mask.\n");
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev,
+				"Can't set consistent PCI DMA mask, aborting.\n");
+			return err;
+		}
+	}
+
+	dma_set_max_seg_size(&pdev->dev, 2u * 1024 * 1024 * 1024);
+	return err;
+}
+
+static int request_bar(struct pci_dev *pdev)
+{
+	int err = 0;
+
+	if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) {
+		dev_err(&pdev->dev, "Missing registers BAR, aborting.\n");
+		return -ENODEV;
+	}
+
+	err = pci_request_regions(pdev, DRIVER_NAME);
+	if (err)
+		dev_err(&pdev->dev, "Couldn't get PCI resources, aborting\n");
+
+	return err;
+}
+
+static void release_bar(struct pci_dev *pdev)
+{
+	pci_release_regions(pdev);
+}
+
+static int mlx5_enable_msix(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+	int num_eqs = 1 << dev->caps.log_max_eq;
+	int nvec;
+	int err;
+	int i;
+
+	nvec = dev->caps.num_ports * num_online_cpus() + MLX5_EQ_VEC_COMP_BASE;
+	nvec = min_t(int, nvec, num_eqs);
+	if (nvec <= MLX5_EQ_VEC_COMP_BASE)
+		return -ENOMEM;
+
+	table->msix_arr = kzalloc(nvec * sizeof(*table->msix_arr), GFP_KERNEL);
+	if (!table->msix_arr)
+		return -ENOMEM;
+
+	for (i = 0; i < nvec; i++)
+		table->msix_arr[i].entry = i;
+
+retry:
+	table->num_comp_vectors = nvec - MLX5_EQ_VEC_COMP_BASE;
+	err = pci_enable_msix(dev->pdev, table->msix_arr, nvec);
+	if (err <= 0) {
+		return err;
+	} else if (err > 2) {
+		nvec = err;
+		goto retry;
+	}
+
+	mlx5_core_dbg(dev, "received %d MSI vectors out of %d requested\n", err, nvec);
+
+	return 0;
+}
+
+static void mlx5_disable_msix(struct mlx5_core_dev *dev)
+{
+	struct mlx5_eq_table *table = &dev->priv.eq_table;
+
+	pci_disable_msix(dev->pdev);
+	kfree(table->msix_arr);
+}
+
+struct mlx5_reg_host_endianess {
+	u8	he;
+	u8      rsvd[15];
+};
+
+static int handle_hca_cap(struct mlx5_core_dev *dev)
+{
+	struct mlx5_cmd_query_hca_cap_mbox_out *query_out = NULL;
+	struct mlx5_cmd_set_hca_cap_mbox_in *set_ctx = NULL;
+	struct mlx5_cmd_query_hca_cap_mbox_in query_ctx;
+	struct mlx5_cmd_set_hca_cap_mbox_out set_out;
+	struct mlx5_profile *prof = dev->profile;
+	u64 flags;
+	int csum = 1;
+	int err;
+
+	memset(&query_ctx, 0, sizeof(query_ctx));
+	query_out = kzalloc(sizeof(*query_out), GFP_KERNEL);
+	if (!query_out)
+		return -ENOMEM;
+
+	set_ctx = kzalloc(sizeof(*set_ctx), GFP_KERNEL);
+	if (!set_ctx) {
+		err = -ENOMEM;
+		goto query_ex;
+	}
+
+	query_ctx.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP);
+	query_ctx.hdr.opmod  = cpu_to_be16(0x1);
+	err = mlx5_cmd_exec(dev, &query_ctx, sizeof(query_ctx),
+				 query_out, sizeof(*query_out));
+	if (err)
+		goto query_ex;
+
+	err = mlx5_cmd_status_to_err(&query_out->hdr);
+	if (err) {
+		mlx5_core_warn(dev, "query hca cap failed, %d\n", err);
+		goto query_ex;
+	}
+
+	memcpy(&set_ctx->hca_cap, &query_out->hca_cap,
+	       sizeof(set_ctx->hca_cap));
+
+	if (prof->mask & MLX5_PROF_MASK_CMDIF_CSUM) {
+		csum = !!prof->cmdif_csum;
+		flags = be64_to_cpu(set_ctx->hca_cap.flags);
+		if (csum)
+			flags |= MLX5_DEV_CAP_FLAG_CMDIF_CSUM;
+		else
+			flags &= ~MLX5_DEV_CAP_FLAG_CMDIF_CSUM;
+
+		set_ctx->hca_cap.flags = cpu_to_be64(flags);
+	}
+
+	if (dev->profile->mask & MLX5_PROF_MASK_QP_SIZE)
+		set_ctx->hca_cap.log_max_qp = dev->profile->log_max_qp;
+
+	memset(&set_out, 0, sizeof(set_out));
+	set_ctx->hca_cap.uar_page_sz = cpu_to_be16(PAGE_SHIFT - 12);
+	set_ctx->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_SET_HCA_CAP);
+	err = mlx5_cmd_exec(dev, set_ctx, sizeof(*set_ctx),
+				 &set_out, sizeof(set_out));
+	if (err) {
+		mlx5_core_warn(dev, "set hca cap failed, %d\n", err);
+		goto query_ex;
+	}
+
+	err = mlx5_cmd_status_to_err(&set_out.hdr);
+	if (err)
+		goto query_ex;
+
+	if (!csum)
+		dev->cmd.checksum_disabled = 1;
+
+query_ex:
+	kfree(query_out);
+	kfree(set_ctx);
+
+	return err;
+}
+
+static int set_hca_ctrl(struct mlx5_core_dev *dev)
+{
+	struct mlx5_reg_host_endianess he_in;
+	struct mlx5_reg_host_endianess he_out;
+	int err;
+
+	memset(&he_in, 0, sizeof(he_in));
+	he_in.he = MLX5_SET_HOST_ENDIANNESS;
+	err = mlx5_core_access_reg(dev, &he_in,  sizeof(he_in),
+					&he_out, sizeof(he_out),
+					MLX5_REG_HOST_ENDIANNESS, 0, 1);
+	return err;
+}
+
+int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+	int err;
+
+	dev->pdev = pdev;
+	pci_set_drvdata(dev->pdev, dev);
+	strncpy(priv->name, dev_name(&pdev->dev), MLX5_MAX_NAME_LEN);
+	priv->name[MLX5_MAX_NAME_LEN - 1] = 0;
+
+	mutex_init(&priv->pgdir_mutex);
+	INIT_LIST_HEAD(&priv->pgdir_list);
+	spin_lock_init(&priv->mkey_lock);
+
+	priv->dbg_root = debugfs_create_dir(dev_name(&pdev->dev), mlx5_debugfs_root);
+	if (!priv->dbg_root)
+		return -ENOMEM;
+
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Cannot enable PCI device, aborting.\n");
+		goto err_dbg;
+	}
+
+	err = request_bar(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "error requesting BARs, aborting.\n");
+		goto err_disable;
+	}
+
+	pci_set_master(pdev);
+
+	err = set_dma_caps(pdev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed setting DMA capabilities mask, aborting\n");
+		goto err_clr_master;
+	}
+
+	dev->iseg_base = pci_resource_start(dev->pdev, 0);
+	dev->iseg = ioremap(dev->iseg_base, sizeof(*dev->iseg));
+	if (!dev->iseg) {
+		err = -ENOMEM;
+		dev_err(&pdev->dev, "Failed mapping initialization segment, aborting\n");
+		goto err_clr_master;
+	}
+	dev_info(&pdev->dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev),
+		 fw_rev_min(dev), fw_rev_sub(dev));
+
+	err = mlx5_cmd_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed initializing command interface, aborting\n");
+		goto err_unmap;
+	}
+
+	mlx5_pagealloc_init(dev);
+	err = set_hca_ctrl(dev);
+	if (err) {
+		dev_err(&pdev->dev, "set_hca_ctrl failed\n");
+		goto err_pagealloc_cleanup;
+	}
+
+	err = handle_hca_cap(dev);
+	if (err) {
+		dev_err(&pdev->dev, "handle_hca_cap failed\n");
+		goto err_pagealloc_cleanup;
+	}
+
+	err = mlx5_satisfy_startup_pages(dev);
+	if (err) {
+		dev_err(&pdev->dev, "failed to allocate startup pages\n");
+		goto err_pagealloc_cleanup;
+	}
+
+	err = mlx5_pagealloc_start(dev);
+	if (err) {
+		dev_err(&pdev->dev, "mlx5_pagealloc_start failed\n");
+		goto err_reclaim_pages;
+	}
+
+	err = mlx5_cmd_init_hca(dev);
+	if (err) {
+		dev_err(&pdev->dev, "init hca failed\n");
+		goto err_pagealloc_stop;
+	}
+
+	mlx5_start_health_poll(dev);
+
+	err = mlx5_cmd_query_hca_cap(dev, &dev->caps);
+	if (err) {
+		dev_err(&pdev->dev, "query hca failed\n");
+		goto err_stop_poll;
+	}
+
+	err = mlx5_cmd_query_adapter(dev);
+	if (err) {
+		dev_err(&pdev->dev, "query adapter failed\n");
+		goto err_stop_poll;
+	}
+
+	err = mlx5_enable_msix(dev);
+	if (err) {
+		dev_err(&pdev->dev, "enable msix failed\n");
+		goto err_stop_poll;
+	}
+
+	err = mlx5_eq_init(dev);
+	if (err) {
+		dev_err(&pdev->dev, "failed to initialize eq\n");
+		goto disable_msix;
+	}
+
+	err = mlx5_alloc_uuars(dev, &priv->uuari);
+	if (err) {
+		dev_err(&pdev->dev, "Failed allocating uar, aborting\n");
+		goto err_eq_cleanup;
+	}
+
+	err = mlx5_start_eqs(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to start pages and async EQs\n");
+		goto err_free_uar;
+	}
+
+	MLX5_INIT_DOORBELL_LOCK(&priv->cq_uar_lock);
+
+	mlx5_init_cq_table(dev);
+	mlx5_init_qp_table(dev);
+	mlx5_init_srq_table(dev);
+
+	return 0;
+
+err_free_uar:
+	mlx5_free_uuars(dev, &priv->uuari);
+
+err_eq_cleanup:
+	mlx5_eq_cleanup(dev);
+
+disable_msix:
+	mlx5_disable_msix(dev);
+
+err_stop_poll:
+	mlx5_stop_health_poll(dev);
+	mlx5_cmd_teardown_hca(dev);
+
+err_pagealloc_stop:
+	mlx5_pagealloc_stop(dev);
+
+err_reclaim_pages:
+	mlx5_reclaim_startup_pages(dev);
+
+err_pagealloc_cleanup:
+	mlx5_pagealloc_cleanup(dev);
+	mlx5_cmd_cleanup(dev);
+
+err_unmap:
+	iounmap(dev->iseg);
+
+err_clr_master:
+	pci_clear_master(dev->pdev);
+	release_bar(dev->pdev);
+
+err_disable:
+	pci_disable_device(dev->pdev);
+
+err_dbg:
+	debugfs_remove(priv->dbg_root);
+	return err;
+}
+EXPORT_SYMBOL(mlx5_dev_init);
+
+void mlx5_dev_cleanup(struct mlx5_core_dev *dev)
+{
+	struct mlx5_priv *priv = &dev->priv;
+
+	mlx5_cleanup_srq_table(dev);
+	mlx5_cleanup_qp_table(dev);
+	mlx5_cleanup_cq_table(dev);
+	mlx5_stop_eqs(dev);
+	mlx5_free_uuars(dev, &priv->uuari);
+	mlx5_eq_cleanup(dev);
+	mlx5_disable_msix(dev);
+	mlx5_stop_health_poll(dev);
+	mlx5_cmd_teardown_hca(dev);
+	mlx5_pagealloc_stop(dev);
+	mlx5_reclaim_startup_pages(dev);
+	mlx5_pagealloc_cleanup(dev);
+	mlx5_cmd_cleanup(dev);
+	iounmap(dev->iseg);
+	pci_clear_master(dev->pdev);
+	release_bar(dev->pdev);
+	pci_disable_device(dev->pdev);
+	debugfs_remove(priv->dbg_root);
+}
+EXPORT_SYMBOL(mlx5_dev_cleanup);
+
+static int __init init(void)
+{
+	int err;
+
+	mlx5_register_debugfs();
+	mlx5_core_wq = create_singlethread_workqueue("mlx5_core_wq");
+	if (!mlx5_core_wq) {
+		err = -ENOMEM;
+		goto err_debug;
+	}
+	mlx5_health_init();
+
+	return 0;
+
+	mlx5_health_cleanup();
+err_debug:
+	mlx5_unregister_debugfs();
+	return err;
+}
+
+static void __exit cleanup(void)
+{
+	mlx5_health_cleanup();
+	destroy_workqueue(mlx5_core_wq);
+	mlx5_unregister_debugfs();
+}
+
+module_init(init);
+module_exit(cleanup);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mcg.c b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c
new file mode 100644
index 000000000000..44837640bd7c
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mcg.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <rdma/ib_verbs.h>
+#include "mlx5_core.h"
+
+struct mlx5_attach_mcg_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	__be32			rsvd;
+	u8			gid[16];
+};
+
+struct mlx5_attach_mcg_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvf[8];
+};
+
+struct mlx5_detach_mcg_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	__be32			rsvd;
+	u8			gid[16];
+};
+
+struct mlx5_detach_mcg_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvf[8];
+};
+
+int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn)
+{
+	struct mlx5_attach_mcg_mbox_in in;
+	struct mlx5_attach_mcg_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ATTACH_TO_MCG);
+	memcpy(in.gid, mgid, sizeof(*mgid));
+	in.qpn = cpu_to_be32(qpn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_attach_mcg);
+
+int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn)
+{
+	struct mlx5_detach_mcg_mbox_in in;
+	struct mlx5_detach_mcg_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DETACH_FROM_MCG);
+	memcpy(in.gid, mgid, sizeof(*mgid));
+	in.qpn = cpu_to_be32(qpn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_detach_mcg);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
new file mode 100644
index 000000000000..68b74e1ae1b0
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MLX5_CORE_H__
+#define __MLX5_CORE_H__
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+
+extern int mlx5_core_debug_mask;
+
+#define mlx5_core_dbg(dev, format, arg...)				       \
+pr_debug("%s:%s:%d:(pid %d): " format, (dev)->priv.name, __func__, __LINE__,   \
+	 current->pid, ##arg)
+
+#define mlx5_core_dbg_mask(dev, mask, format, arg...)			       \
+do {									       \
+	if ((mask) & mlx5_core_debug_mask)				       \
+		pr_debug("%s:%s:%d:(pid %d): " format, (dev)->priv.name,       \
+			 __func__, __LINE__, current->pid, ##arg);	       \
+} while (0)
+
+#define mlx5_core_err(dev, format, arg...) \
+pr_err("%s:%s:%d:(pid %d): " format, (dev)->priv.name, __func__, __LINE__,     \
+	current->pid, ##arg)
+
+#define mlx5_core_warn(dev, format, arg...) \
+pr_warn("%s:%s:%d:(pid %d): " format, (dev)->priv.name, __func__, __LINE__,    \
+	current->pid, ##arg)
+
+enum {
+	MLX5_CMD_DATA, /* print command payload only */
+	MLX5_CMD_TIME, /* print command execution time */
+};
+
+
+int mlx5_cmd_query_hca_cap(struct mlx5_core_dev *dev,
+			   struct mlx5_caps *caps);
+int mlx5_cmd_query_adapter(struct mlx5_core_dev *dev);
+int mlx5_cmd_init_hca(struct mlx5_core_dev *dev);
+int mlx5_cmd_teardown_hca(struct mlx5_core_dev *dev);
+
+#endif /* __MLX5_CORE_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
new file mode 100644
index 000000000000..5b44e2e46daf
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			  struct mlx5_create_mkey_mbox_in *in, int inlen)
+{
+	struct mlx5_create_mkey_mbox_out out;
+	int err;
+	u8 key;
+
+	memset(&out, 0, sizeof(out));
+	spin_lock(&dev->priv.mkey_lock);
+	key = dev->priv.mkey_key++;
+	spin_unlock(&dev->priv.mkey_lock);
+	in->seg.qpn_mkey7_0 |= cpu_to_be32(key);
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_MKEY);
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (err) {
+		mlx5_core_dbg(dev, "cmd exec faile %d\n", err);
+		return err;
+	}
+
+	if (out.hdr.status) {
+		mlx5_core_dbg(dev, "status %d\n", out.hdr.status);
+		return mlx5_cmd_status_to_err(&out.hdr);
+	}
+
+	mr->key = mlx5_idx_to_mkey(be32_to_cpu(out.mkey) & 0xffffff) | key;
+	mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", be32_to_cpu(out.mkey), key, mr->key);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_mkey);
+
+int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr)
+{
+	struct mlx5_destroy_mkey_mbox_in in;
+	struct mlx5_destroy_mkey_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_MKEY);
+	in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key));
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_destroy_mkey);
+
+int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			 struct mlx5_query_mkey_mbox_out *out, int outlen)
+{
+	struct mlx5_destroy_mkey_mbox_in in;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(out, 0, outlen);
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_MKEY);
+	in.mkey = cpu_to_be32(mlx5_mkey_to_idx(mr->key));
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen);
+	if (err)
+		return err;
+
+	if (out->hdr.status)
+		return mlx5_cmd_status_to_err(&out->hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_query_mkey);
+
+int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			     u32 *mkey)
+{
+	struct mlx5_query_special_ctxs_mbox_in in;
+	struct mlx5_query_special_ctxs_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	*mkey = be32_to_cpu(out.dump_fill_mkey);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_dump_fill_mkey);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
new file mode 100644
index 000000000000..f0bf46339b28
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
@@ -0,0 +1,435 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm-generic/kmap_types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+enum {
+	MLX5_PAGES_CANT_GIVE	= 0,
+	MLX5_PAGES_GIVE		= 1,
+	MLX5_PAGES_TAKE		= 2
+};
+
+struct mlx5_pages_req {
+	struct mlx5_core_dev *dev;
+	u32	func_id;
+	s16	npages;
+	struct work_struct work;
+};
+
+struct fw_page {
+	struct rb_node	rb_node;
+	u64		addr;
+	struct page	*page;
+	u16		func_id;
+};
+
+struct mlx5_query_pages_inbox {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_query_pages_outbox {
+	struct mlx5_outbox_hdr	hdr;
+	u8			reserved[2];
+	__be16			func_id;
+	__be16			init_pages;
+	__be16			num_pages;
+};
+
+struct mlx5_manage_pages_inbox {
+	struct mlx5_inbox_hdr	hdr;
+	__be16			rsvd0;
+	__be16			func_id;
+	__be16			rsvd1;
+	__be16			num_entries;
+	u8			rsvd2[16];
+	__be64			pas[0];
+};
+
+struct mlx5_manage_pages_outbox {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[2];
+	__be16			num_entries;
+	u8			rsvd1[20];
+	__be64			pas[0];
+};
+
+static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u16 func_id)
+{
+	struct rb_root *root = &dev->priv.page_root;
+	struct rb_node **new = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct fw_page *nfp;
+	struct fw_page *tfp;
+
+	while (*new) {
+		parent = *new;
+		tfp = rb_entry(parent, struct fw_page, rb_node);
+		if (tfp->addr < addr)
+			new = &parent->rb_left;
+		else if (tfp->addr > addr)
+			new = &parent->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	nfp = kmalloc(sizeof(*nfp), GFP_KERNEL);
+	if (!nfp)
+		return -ENOMEM;
+
+	nfp->addr = addr;
+	nfp->page = page;
+	nfp->func_id = func_id;
+
+	rb_link_node(&nfp->rb_node, parent, new);
+	rb_insert_color(&nfp->rb_node, root);
+
+	return 0;
+}
+
+static struct page *remove_page(struct mlx5_core_dev *dev, u64 addr)
+{
+	struct rb_root *root = &dev->priv.page_root;
+	struct rb_node *tmp = root->rb_node;
+	struct page *result = NULL;
+	struct fw_page *tfp;
+
+	while (tmp) {
+		tfp = rb_entry(tmp, struct fw_page, rb_node);
+		if (tfp->addr < addr) {
+			tmp = tmp->rb_left;
+		} else if (tfp->addr > addr) {
+			tmp = tmp->rb_right;
+		} else {
+			rb_erase(&tfp->rb_node, root);
+			result = tfp->page;
+			kfree(tfp);
+			break;
+		}
+	}
+
+	return result;
+}
+
+static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id,
+				s16 *pages, s16 *init_pages)
+{
+	struct mlx5_query_pages_inbox	in;
+	struct mlx5_query_pages_outbox	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_PAGES);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	if (pages)
+		*pages = be16_to_cpu(out.num_pages);
+	if (init_pages)
+		*init_pages = be16_to_cpu(out.init_pages);
+	*func_id = be16_to_cpu(out.func_id);
+
+	return err;
+}
+
+static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages,
+		      int notify_fail)
+{
+	struct mlx5_manage_pages_inbox *in;
+	struct mlx5_manage_pages_outbox out;
+	struct page *page;
+	int inlen;
+	u64 addr;
+	int err;
+	int i;
+
+	inlen = sizeof(*in) + npages * sizeof(in->pas[0]);
+	in = mlx5_vzalloc(inlen);
+	if (!in) {
+		mlx5_core_warn(dev, "vzalloc failed %d\n", inlen);
+		return -ENOMEM;
+	}
+	memset(&out, 0, sizeof(out));
+
+	for (i = 0; i < npages; i++) {
+		page = alloc_page(GFP_HIGHUSER);
+		if (!page) {
+			err = -ENOMEM;
+			mlx5_core_warn(dev, "failed to allocate page\n");
+			goto out_alloc;
+		}
+		addr = dma_map_page(&dev->pdev->dev, page, 0,
+				    PAGE_SIZE, DMA_BIDIRECTIONAL);
+		if (dma_mapping_error(&dev->pdev->dev, addr)) {
+			mlx5_core_warn(dev, "failed dma mapping page\n");
+			__free_page(page);
+			err = -ENOMEM;
+			goto out_alloc;
+		}
+		err = insert_page(dev, addr, page, func_id);
+		if (err) {
+			mlx5_core_err(dev, "failed to track allocated page\n");
+			dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+			__free_page(page);
+			err = -ENOMEM;
+			goto out_alloc;
+		}
+		in->pas[i] = cpu_to_be64(addr);
+	}
+
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES);
+	in->hdr.opmod = cpu_to_be16(MLX5_PAGES_GIVE);
+	in->func_id = cpu_to_be16(func_id);
+	in->num_entries = cpu_to_be16(npages);
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	mlx5_core_dbg(dev, "err %d\n", err);
+	if (err) {
+		mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n", func_id, npages, err);
+		goto out_alloc;
+	}
+	dev->priv.fw_pages += npages;
+
+	if (out.hdr.status) {
+		err = mlx5_cmd_status_to_err(&out.hdr);
+		if (err) {
+			mlx5_core_warn(dev, "func_id 0x%x, npages %d, status %d\n", func_id, npages, out.hdr.status);
+			goto out_alloc;
+		}
+	}
+
+	mlx5_core_dbg(dev, "err %d\n", err);
+
+	goto out_free;
+
+out_alloc:
+	if (notify_fail) {
+		memset(in, 0, inlen);
+		memset(&out, 0, sizeof(out));
+		in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES);
+		in->hdr.opmod = cpu_to_be16(MLX5_PAGES_CANT_GIVE);
+		if (mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)))
+			mlx5_core_warn(dev, "\n");
+	}
+	for (i--; i >= 0; i--) {
+		addr = be64_to_cpu(in->pas[i]);
+		page = remove_page(dev, addr);
+		if (!page) {
+			mlx5_core_err(dev, "BUG: can't remove page at addr 0x%llx\n",
+				      addr);
+			continue;
+		}
+		dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+		__free_page(page);
+	}
+
+out_free:
+	mlx5_vfree(in);
+	return err;
+}
+
+static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages,
+			 int *nclaimed)
+{
+	struct mlx5_manage_pages_inbox   in;
+	struct mlx5_manage_pages_outbox *out;
+	struct page *page;
+	int num_claimed;
+	int outlen;
+	u64 addr;
+	int err;
+	int i;
+
+	memset(&in, 0, sizeof(in));
+	outlen = sizeof(*out) + npages * sizeof(out->pas[0]);
+	out = mlx5_vzalloc(outlen);
+	if (!out)
+		return -ENOMEM;
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES);
+	in.hdr.opmod = cpu_to_be16(MLX5_PAGES_TAKE);
+	in.func_id = cpu_to_be16(func_id);
+	in.num_entries = cpu_to_be16(npages);
+	mlx5_core_dbg(dev, "npages %d, outlen %d\n", npages, outlen);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen);
+	if (err) {
+		mlx5_core_err(dev, "failed recliaming pages\n");
+		goto out_free;
+	}
+	dev->priv.fw_pages -= npages;
+
+	if (out->hdr.status) {
+		err = mlx5_cmd_status_to_err(&out->hdr);
+		goto out_free;
+	}
+
+	num_claimed = be16_to_cpu(out->num_entries);
+	if (nclaimed)
+		*nclaimed = num_claimed;
+
+	for (i = 0; i < num_claimed; i++) {
+		addr = be64_to_cpu(out->pas[i]);
+		page = remove_page(dev, addr);
+		if (!page) {
+			mlx5_core_warn(dev, "FW reported unknown DMA address 0x%llx\n", addr);
+		} else {
+			dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
+			__free_page(page);
+		}
+	}
+
+out_free:
+	mlx5_vfree(out);
+	return err;
+}
+
+static void pages_work_handler(struct work_struct *work)
+{
+	struct mlx5_pages_req *req = container_of(work, struct mlx5_pages_req, work);
+	struct mlx5_core_dev *dev = req->dev;
+	int err = 0;
+
+	if (req->npages < 0)
+		err = reclaim_pages(dev, req->func_id, -1 * req->npages, NULL);
+	else if (req->npages > 0)
+		err = give_pages(dev, req->func_id, req->npages, 1);
+
+	if (err)
+		mlx5_core_warn(dev, "%s fail %d\n", req->npages < 0 ?
+			       "reclaim" : "give", err);
+
+	kfree(req);
+}
+
+void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id,
+				 s16 npages)
+{
+	struct mlx5_pages_req *req;
+
+	req = kzalloc(sizeof(*req), GFP_ATOMIC);
+	if (!req) {
+		mlx5_core_warn(dev, "failed to allocate pages request\n");
+		return;
+	}
+
+	req->dev = dev;
+	req->func_id = func_id;
+	req->npages = npages;
+	INIT_WORK(&req->work, pages_work_handler);
+	queue_work(dev->priv.pg_wq, &req->work);
+}
+
+int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev)
+{
+	s16 uninitialized_var(init_pages);
+	u16 uninitialized_var(func_id);
+	int err;
+
+	err = mlx5_cmd_query_pages(dev, &func_id, NULL, &init_pages);
+	if (err)
+		return err;
+
+	mlx5_core_dbg(dev, "requested %d init pages for func_id 0x%x\n", init_pages, func_id);
+
+	return give_pages(dev, func_id, init_pages, 0);
+}
+
+static int optimal_reclaimed_pages(void)
+{
+	struct mlx5_cmd_prot_block *block;
+	struct mlx5_cmd_layout *lay;
+	int ret;
+
+	ret = (sizeof(lay->in) + sizeof(block->data) -
+	       sizeof(struct mlx5_manage_pages_outbox)) / 8;
+
+	return ret;
+}
+
+int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev)
+{
+	unsigned long end = jiffies + msecs_to_jiffies(5000);
+	struct fw_page *fwp;
+	struct rb_node *p;
+	int err;
+
+	do {
+		p = rb_first(&dev->priv.page_root);
+		if (p) {
+			fwp = rb_entry(p, struct fw_page, rb_node);
+			err = reclaim_pages(dev, fwp->func_id, optimal_reclaimed_pages(), NULL);
+			if (err) {
+				mlx5_core_warn(dev, "failed reclaiming pages (%d)\n", err);
+				return err;
+			}
+		}
+		if (time_after(jiffies, end)) {
+			mlx5_core_warn(dev, "FW did not return all pages. giving up...\n");
+			break;
+		}
+	} while (p);
+
+	return 0;
+}
+
+void mlx5_pagealloc_init(struct mlx5_core_dev *dev)
+{
+	dev->priv.page_root = RB_ROOT;
+}
+
+void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev)
+{
+	/* nothing */
+}
+
+int mlx5_pagealloc_start(struct mlx5_core_dev *dev)
+{
+	dev->priv.pg_wq = create_singlethread_workqueue("mlx5_page_allocator");
+	if (!dev->priv.pg_wq)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void mlx5_pagealloc_stop(struct mlx5_core_dev *dev)
+{
+	destroy_workqueue(dev->priv.pg_wq);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pd.c b/drivers/net/ethernet/mellanox/mlx5/core/pd.c
new file mode 100644
index 000000000000..790da5c4ca4f
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pd.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+struct mlx5_alloc_pd_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_alloc_pd_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			pdn;
+	u8			rsvd[4];
+};
+
+struct mlx5_dealloc_pd_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			pdn;
+	u8			rsvd[4];
+};
+
+struct mlx5_dealloc_pd_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn)
+{
+	struct mlx5_alloc_pd_mbox_in	in;
+	struct mlx5_alloc_pd_mbox_out	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ALLOC_PD);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	*pdn = be32_to_cpu(out.pdn) & 0xffffff;
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_alloc_pd);
+
+int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn)
+{
+	struct mlx5_dealloc_pd_mbox_in	in;
+	struct mlx5_dealloc_pd_mbox_out	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DEALLOC_PD);
+	in.pdn = cpu_to_be32(pdn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_dealloc_pd);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/port.c b/drivers/net/ethernet/mellanox/mlx5/core/port.c
new file mode 100644
index 000000000000..f6afe7b5a675
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/port.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in,
+			 int size_in, void *data_out, int size_out,
+			 u16 reg_num, int arg, int write)
+{
+	struct mlx5_access_reg_mbox_in *in = NULL;
+	struct mlx5_access_reg_mbox_out *out = NULL;
+	int err = -ENOMEM;
+
+	in = mlx5_vzalloc(sizeof(*in) + size_in);
+	if (!in)
+		return -ENOMEM;
+
+	out = mlx5_vzalloc(sizeof(*out) + size_out);
+	if (!out)
+		goto ex1;
+
+	memcpy(in->data, data_in, size_in);
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ACCESS_REG);
+	in->hdr.opmod = cpu_to_be16(!write);
+	in->arg = cpu_to_be32(arg);
+	in->register_id = cpu_to_be16(reg_num);
+	err = mlx5_cmd_exec(dev, in, sizeof(*in) + size_in, out,
+			    sizeof(out) + size_out);
+	if (err)
+		goto ex2;
+
+	if (out->hdr.status)
+		err = mlx5_cmd_status_to_err(&out->hdr);
+
+	if (!err)
+		memcpy(data_out, out->data, size_out);
+
+ex2:
+	mlx5_vfree(out);
+ex1:
+	mlx5_vfree(in);
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_access_reg);
+
+
+struct mlx5_reg_pcap {
+	u8			rsvd0;
+	u8			port_num;
+	u8			rsvd1[2];
+	__be32			caps_127_96;
+	__be32			caps_95_64;
+	__be32			caps_63_32;
+	__be32			caps_31_0;
+};
+
+int mlx5_set_port_caps(struct mlx5_core_dev *dev, int port_num, u32 caps)
+{
+	struct mlx5_reg_pcap in;
+	struct mlx5_reg_pcap out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	in.caps_127_96 = cpu_to_be32(caps);
+	in.port_num = port_num;
+
+	err = mlx5_core_access_reg(dev, &in, sizeof(in), &out,
+				   sizeof(out), MLX5_REG_PCAP, 0, 1);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_set_port_caps);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
new file mode 100644
index 000000000000..54faf8bfcaf4
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/mlx5/cmd.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/driver.h>
+
+#include "mlx5_core.h"
+
+void mlx5_qp_event(struct mlx5_core_dev *dev, u32 qpn, int event_type)
+{
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+	struct mlx5_core_qp *qp;
+
+	spin_lock(&table->lock);
+
+	qp = radix_tree_lookup(&table->tree, qpn);
+	if (qp)
+		atomic_inc(&qp->refcount);
+
+	spin_unlock(&table->lock);
+
+	if (!qp) {
+		mlx5_core_warn(dev, "Async event for bogus QP 0x%x\n", qpn);
+		return;
+	}
+
+	qp->event(qp, event_type);
+
+	if (atomic_dec_and_test(&qp->refcount))
+		complete(&qp->free);
+}
+
+int mlx5_core_create_qp(struct mlx5_core_dev *dev,
+			struct mlx5_core_qp *qp,
+			struct mlx5_create_qp_mbox_in *in,
+			int inlen)
+{
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+	struct mlx5_create_qp_mbox_out out;
+	struct mlx5_destroy_qp_mbox_in din;
+	struct mlx5_destroy_qp_mbox_out dout;
+	int err;
+
+	memset(&dout, 0, sizeof(dout));
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_QP);
+
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (err) {
+		mlx5_core_warn(dev, "ret %d", err);
+		return err;
+	}
+
+	if (out.hdr.status) {
+		pr_warn("current num of QPs 0x%x\n", atomic_read(&dev->num_qps));
+		return mlx5_cmd_status_to_err(&out.hdr);
+	}
+
+	qp->qpn = be32_to_cpu(out.qpn) & 0xffffff;
+	mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn);
+
+	spin_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree, qp->qpn, qp);
+	spin_unlock_irq(&table->lock);
+	if (err) {
+		mlx5_core_warn(dev, "err %d", err);
+		goto err_cmd;
+	}
+
+	err = mlx5_debug_qp_add(dev, qp);
+	if (err)
+		mlx5_core_dbg(dev, "failed adding QP 0x%x to debug file system\n",
+			      qp->qpn);
+
+	qp->pid = current->pid;
+	atomic_set(&qp->refcount, 1);
+	atomic_inc(&dev->num_qps);
+	init_completion(&qp->free);
+
+	return 0;
+
+err_cmd:
+	memset(&din, 0, sizeof(din));
+	memset(&dout, 0, sizeof(dout));
+	din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_QP);
+	din.qpn = cpu_to_be32(qp->qpn);
+	mlx5_cmd_exec(dev, &din, sizeof(din), &out, sizeof(dout));
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_create_qp);
+
+int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
+			 struct mlx5_core_qp *qp)
+{
+	struct mlx5_destroy_qp_mbox_in in;
+	struct mlx5_destroy_qp_mbox_out out;
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+	unsigned long flags;
+	int err;
+
+	mlx5_debug_qp_remove(dev, qp);
+
+	spin_lock_irqsave(&table->lock, flags);
+	radix_tree_delete(&table->tree, qp->qpn);
+	spin_unlock_irqrestore(&table->lock, flags);
+
+	if (atomic_dec_and_test(&qp->refcount))
+		complete(&qp->free);
+	wait_for_completion(&qp->free);
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_QP);
+	in.qpn = cpu_to_be32(qp->qpn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	atomic_dec(&dev->num_qps);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp);
+
+int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state,
+			enum mlx5_qp_state new_state,
+			struct mlx5_modify_qp_mbox_in *in, int sqd_event,
+			struct mlx5_core_qp *qp)
+{
+	static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
+		[MLX5_QP_STATE_RST] = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+			[MLX5_QP_STATE_INIT]	= MLX5_CMD_OP_RST2INIT_QP,
+		},
+		[MLX5_QP_STATE_INIT]  = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+			[MLX5_QP_STATE_INIT]	= MLX5_CMD_OP_INIT2INIT_QP,
+			[MLX5_QP_STATE_RTR]	= MLX5_CMD_OP_INIT2RTR_QP,
+		},
+		[MLX5_QP_STATE_RTR]   = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+			[MLX5_QP_STATE_RTS]	= MLX5_CMD_OP_RTR2RTS_QP,
+		},
+		[MLX5_QP_STATE_RTS]   = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+			[MLX5_QP_STATE_RTS]	= MLX5_CMD_OP_RTS2RTS_QP,
+			[MLX5_QP_STATE_SQD]	= MLX5_CMD_OP_RTS2SQD_QP,
+		},
+		[MLX5_QP_STATE_SQD] = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+			[MLX5_QP_STATE_RTS]	= MLX5_CMD_OP_SQD2RTS_QP,
+			[MLX5_QP_STATE_SQD]	= MLX5_CMD_OP_SQD2SQD_QP,
+		},
+		[MLX5_QP_STATE_SQER] = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+			[MLX5_QP_STATE_RTS]	= MLX5_CMD_OP_SQERR2RTS_QP,
+		},
+		[MLX5_QP_STATE_ERR] = {
+			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
+			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
+		}
+	};
+
+	struct mlx5_modify_qp_mbox_out out;
+	int err = 0;
+	u16 op;
+
+	if (cur_state >= MLX5_QP_NUM_STATE || new_state >= MLX5_QP_NUM_STATE ||
+	    !optab[cur_state][new_state])
+		return -EINVAL;
+
+	memset(&out, 0, sizeof(out));
+	op = optab[cur_state][new_state];
+	in->hdr.opcode = cpu_to_be16(op);
+	in->qpn = cpu_to_be32(qp->qpn);
+	err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	return mlx5_cmd_status_to_err(&out.hdr);
+}
+EXPORT_SYMBOL_GPL(mlx5_core_qp_modify);
+
+void mlx5_init_qp_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_qp_table *table = &dev->priv.qp_table;
+
+	spin_lock_init(&table->lock);
+	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+	mlx5_qp_debugfs_init(dev);
+}
+
+void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev)
+{
+	mlx5_qp_debugfs_cleanup(dev);
+}
+
+int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp,
+		       struct mlx5_query_qp_mbox_out *out, int outlen)
+{
+	struct mlx5_query_qp_mbox_in in;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(out, 0, outlen);
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_QP);
+	in.qpn = cpu_to_be32(qp->qpn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, outlen);
+	if (err)
+		return err;
+
+	if (out->hdr.status)
+		return mlx5_cmd_status_to_err(&out->hdr);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_qp_query);
+
+int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn)
+{
+	struct mlx5_alloc_xrcd_mbox_in in;
+	struct mlx5_alloc_xrcd_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ALLOC_XRCD);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+	else
+		*xrcdn = be32_to_cpu(out.xrcdn);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_xrcd_alloc);
+
+int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn)
+{
+	struct mlx5_dealloc_xrcd_mbox_in in;
+	struct mlx5_dealloc_xrcd_mbox_out out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DEALLOC_XRCD);
+	in.xrcdn = cpu_to_be32(xrcdn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(mlx5_core_xrcd_dealloc);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
new file mode 100644
index 000000000000..38bce93f8314
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include <linux/mlx5/srq.h>
+#include <rdma/ib_verbs.h>
+#include "mlx5_core.h"
+
+void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type)
+{
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+	struct mlx5_core_srq *srq;
+
+	spin_lock(&table->lock);
+
+	srq = radix_tree_lookup(&table->tree, srqn);
+	if (srq)
+		atomic_inc(&srq->refcount);
+
+	spin_unlock(&table->lock);
+
+	if (!srq) {
+		mlx5_core_warn(dev, "Async event for bogus SRQ 0x%08x\n", srqn);
+		return;
+	}
+
+	srq->event(srq, event_type);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+}
+
+struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn)
+{
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+	struct mlx5_core_srq *srq;
+
+	spin_lock(&table->lock);
+
+	srq = radix_tree_lookup(&table->tree, srqn);
+	if (srq)
+		atomic_inc(&srq->refcount);
+
+	spin_unlock(&table->lock);
+
+	return srq;
+}
+EXPORT_SYMBOL(mlx5_core_get_srq);
+
+int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_create_srq_mbox_in *in, int inlen)
+{
+	struct mlx5_create_srq_mbox_out out;
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+	struct mlx5_destroy_srq_mbox_in din;
+	struct mlx5_destroy_srq_mbox_out dout;
+	int err;
+
+	memset(&out, 0, sizeof(out));
+	in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_SRQ);
+	err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	srq->srqn = be32_to_cpu(out.srqn) & 0xffffff;
+
+	atomic_set(&srq->refcount, 1);
+	init_completion(&srq->free);
+
+	spin_lock_irq(&table->lock);
+	err = radix_tree_insert(&table->tree, srq->srqn, srq);
+	spin_unlock_irq(&table->lock);
+	if (err) {
+		mlx5_core_warn(dev, "err %d, srqn 0x%x\n", err, srq->srqn);
+		goto err_cmd;
+	}
+
+	return 0;
+
+err_cmd:
+	memset(&din, 0, sizeof(din));
+	memset(&dout, 0, sizeof(dout));
+	din.srqn = cpu_to_be32(srq->srqn);
+	din.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ);
+	mlx5_cmd_exec(dev, &din, sizeof(din), &dout, sizeof(dout));
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_create_srq);
+
+int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq)
+{
+	struct mlx5_destroy_srq_mbox_in in;
+	struct mlx5_destroy_srq_mbox_out out;
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+	struct mlx5_core_srq *tmp;
+	int err;
+
+	spin_lock_irq(&table->lock);
+	tmp = radix_tree_delete(&table->tree, srq->srqn);
+	spin_unlock_irq(&table->lock);
+	if (!tmp) {
+		mlx5_core_warn(dev, "srq 0x%x not found in tree\n", srq->srqn);
+		return -EINVAL;
+	}
+	if (tmp != srq) {
+		mlx5_core_warn(dev, "corruption on srqn 0x%x\n", srq->srqn);
+		return -EINVAL;
+	}
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DESTROY_SRQ);
+	in.srqn = cpu_to_be32(srq->srqn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	if (atomic_dec_and_test(&srq->refcount))
+		complete(&srq->free);
+	wait_for_completion(&srq->free);
+
+	return 0;
+}
+EXPORT_SYMBOL(mlx5_core_destroy_srq);
+
+int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			struct mlx5_query_srq_mbox_out *out)
+{
+	struct mlx5_query_srq_mbox_in in;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(out, 0, sizeof(*out));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_SRQ);
+	in.srqn = cpu_to_be32(srq->srqn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), out, sizeof(*out));
+	if (err)
+		return err;
+
+	if (out->hdr.status)
+		return mlx5_cmd_status_to_err(&out->hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_query_srq);
+
+int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+		      u16 lwm, int is_srq)
+{
+	struct mlx5_arm_srq_mbox_in	in;
+	struct mlx5_arm_srq_mbox_out	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ARM_RQ);
+	in.hdr.opmod = cpu_to_be16(!!is_srq);
+	in.srqn = cpu_to_be32(srq->srqn);
+	in.lwm = cpu_to_be16(lwm);
+
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		return err;
+
+	if (out.hdr.status)
+		return mlx5_cmd_status_to_err(&out.hdr);
+
+	return err;
+}
+EXPORT_SYMBOL(mlx5_core_arm_srq);
+
+void mlx5_init_srq_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_srq_table *table = &dev->priv.srq_table;
+
+	spin_lock_init(&table->lock);
+	INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+}
+
+void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev)
+{
+	/* nothing */
+}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/uar.c b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
new file mode 100644
index 000000000000..71d4a3937200
--- /dev/null
+++ b/drivers/net/ethernet/mellanox/mlx5/core/uar.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cmd.h>
+#include "mlx5_core.h"
+
+enum {
+	NUM_DRIVER_UARS		= 4,
+	NUM_LOW_LAT_UUARS	= 4,
+};
+
+
+struct mlx5_alloc_uar_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_alloc_uar_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			uarn;
+	u8			rsvd[4];
+};
+
+struct mlx5_free_uar_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			uarn;
+	u8			rsvd[4];
+};
+
+struct mlx5_free_uar_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn)
+{
+	struct mlx5_alloc_uar_mbox_in	in;
+	struct mlx5_alloc_uar_mbox_out	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	memset(&out, 0, sizeof(out));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_ALLOC_UAR);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		goto ex;
+
+	if (out.hdr.status) {
+		err = mlx5_cmd_status_to_err(&out.hdr);
+		goto ex;
+	}
+
+	*uarn = be32_to_cpu(out.uarn) & 0xffffff;
+
+ex:
+	return err;
+}
+EXPORT_SYMBOL(mlx5_cmd_alloc_uar);
+
+int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn)
+{
+	struct mlx5_free_uar_mbox_in	in;
+	struct mlx5_free_uar_mbox_out	out;
+	int err;
+
+	memset(&in, 0, sizeof(in));
+	in.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_DEALLOC_UAR);
+	in.uarn = cpu_to_be32(uarn);
+	err = mlx5_cmd_exec(dev, &in, sizeof(in), &out, sizeof(out));
+	if (err)
+		goto ex;
+
+	if (out.hdr.status)
+		err = mlx5_cmd_status_to_err(&out.hdr);
+
+ex:
+	return err;
+}
+EXPORT_SYMBOL(mlx5_cmd_free_uar);
+
+static int need_uuar_lock(int uuarn)
+{
+	int tot_uuars = NUM_DRIVER_UARS * MLX5_BF_REGS_PER_PAGE;
+
+	if (uuarn == 0 || tot_uuars - NUM_LOW_LAT_UUARS)
+		return 0;
+
+	return 1;
+}
+
+int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari)
+{
+	int tot_uuars = NUM_DRIVER_UARS * MLX5_BF_REGS_PER_PAGE;
+	struct mlx5_bf *bf;
+	phys_addr_t addr;
+	int err;
+	int i;
+
+	uuari->num_uars = NUM_DRIVER_UARS;
+	uuari->num_low_latency_uuars = NUM_LOW_LAT_UUARS;
+
+	mutex_init(&uuari->lock);
+	uuari->uars = kcalloc(uuari->num_uars, sizeof(*uuari->uars), GFP_KERNEL);
+	if (!uuari->uars)
+		return -ENOMEM;
+
+	uuari->bfs = kcalloc(tot_uuars, sizeof(*uuari->bfs), GFP_KERNEL);
+	if (!uuari->bfs) {
+		err = -ENOMEM;
+		goto out_uars;
+	}
+
+	uuari->bitmap = kcalloc(BITS_TO_LONGS(tot_uuars), sizeof(*uuari->bitmap),
+				GFP_KERNEL);
+	if (!uuari->bitmap) {
+		err = -ENOMEM;
+		goto out_bfs;
+	}
+
+	uuari->count = kcalloc(tot_uuars, sizeof(*uuari->count), GFP_KERNEL);
+	if (!uuari->count) {
+		err = -ENOMEM;
+		goto out_bitmap;
+	}
+
+	for (i = 0; i < uuari->num_uars; i++) {
+		err = mlx5_cmd_alloc_uar(dev, &uuari->uars[i].index);
+		if (err)
+			goto out_count;
+
+		addr = dev->iseg_base + ((phys_addr_t)(uuari->uars[i].index) << PAGE_SHIFT);
+		uuari->uars[i].map = ioremap(addr, PAGE_SIZE);
+		if (!uuari->uars[i].map) {
+			mlx5_cmd_free_uar(dev, uuari->uars[i].index);
+			goto out_count;
+		}
+		mlx5_core_dbg(dev, "allocated uar index 0x%x, mmaped at %p\n",
+			      uuari->uars[i].index, uuari->uars[i].map);
+	}
+
+	for (i = 0; i < tot_uuars; i++) {
+		bf = &uuari->bfs[i];
+
+		bf->buf_size = dev->caps.bf_reg_size / 2;
+		bf->uar = &uuari->uars[i / MLX5_BF_REGS_PER_PAGE];
+		bf->regreg = uuari->uars[i / MLX5_BF_REGS_PER_PAGE].map;
+		bf->reg = NULL; /* Add WC support */
+		bf->offset = (i % MLX5_BF_REGS_PER_PAGE) * dev->caps.bf_reg_size +
+			MLX5_BF_OFFSET;
+		bf->need_lock = need_uuar_lock(i);
+		spin_lock_init(&bf->lock);
+		spin_lock_init(&bf->lock32);
+		bf->uuarn = i;
+	}
+
+	return 0;
+
+out_count:
+	for (i--; i >= 0; i--) {
+		iounmap(uuari->uars[i].map);
+		mlx5_cmd_free_uar(dev, uuari->uars[i].index);
+	}
+	kfree(uuari->count);
+
+out_bitmap:
+	kfree(uuari->bitmap);
+
+out_bfs:
+	kfree(uuari->bfs);
+
+out_uars:
+	kfree(uuari->uars);
+	return err;
+}
+
+int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari)
+{
+	int i = uuari->num_uars;
+
+	for (i--; i >= 0; i--) {
+		iounmap(uuari->uars[i].map);
+		mlx5_cmd_free_uar(dev, uuari->uars[i].index);
+	}
+
+	kfree(uuari->count);
+	kfree(uuari->bitmap);
+	kfree(uuari->bfs);
+	kfree(uuari->uars);
+
+	return 0;
+}
diff --git a/include/linux/mlx5/cmd.h b/include/linux/mlx5/cmd.h
new file mode 100644
index 000000000000..2826a4b6071e
--- /dev/null
+++ b/include/linux/mlx5/cmd.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_CMD_H
+#define MLX5_CMD_H
+
+#include <linux/types.h>
+
+struct manage_pages_layout {
+	u64	ptr;
+	u32	reserved;
+	u16	num_entries;
+	u16	func_id;
+};
+
+
+struct mlx5_cmd_alloc_uar_imm_out {
+	u32	rsvd[3];
+	u32	uarn;
+};
+
+#endif /* MLX5_CMD_H */
diff --git a/include/linux/mlx5/cq.h b/include/linux/mlx5/cq.h
new file mode 100644
index 000000000000..3db67f73d96d
--- /dev/null
+++ b/include/linux/mlx5/cq.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_CORE_CQ_H
+#define MLX5_CORE_CQ_H
+
+#include <rdma/ib_verbs.h>
+#include <linux/mlx5/driver.h>
+
+
+struct mlx5_core_cq {
+	u32			cqn;
+	int			cqe_sz;
+	__be32		       *set_ci_db;
+	__be32		       *arm_db;
+	atomic_t		refcount;
+	struct completion	free;
+	unsigned		vector;
+	int			irqn;
+	void (*comp)		(struct mlx5_core_cq *);
+	void (*event)		(struct mlx5_core_cq *, enum mlx5_event);
+	struct mlx5_uar	       *uar;
+	u32			cons_index;
+	unsigned		arm_sn;
+	struct mlx5_rsc_debug	*dbg;
+	int			pid;
+};
+
+
+enum {
+	MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR		= 0x01,
+	MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR		= 0x02,
+	MLX5_CQE_SYNDROME_LOCAL_PROT_ERR		= 0x04,
+	MLX5_CQE_SYNDROME_WR_FLUSH_ERR			= 0x05,
+	MLX5_CQE_SYNDROME_MW_BIND_ERR			= 0x06,
+	MLX5_CQE_SYNDROME_BAD_RESP_ERR			= 0x10,
+	MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR		= 0x11,
+	MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR		= 0x12,
+	MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR		= 0x13,
+	MLX5_CQE_SYNDROME_REMOTE_OP_ERR			= 0x14,
+	MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR	= 0x15,
+	MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR		= 0x16,
+	MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR		= 0x22,
+};
+
+enum {
+	MLX5_CQE_OWNER_MASK	= 1,
+	MLX5_CQE_REQ		= 0,
+	MLX5_CQE_RESP_WR_IMM	= 1,
+	MLX5_CQE_RESP_SEND	= 2,
+	MLX5_CQE_RESP_SEND_IMM	= 3,
+	MLX5_CQE_RESP_SEND_INV	= 4,
+	MLX5_CQE_RESIZE_CQ	= 0xff, /* TBD */
+	MLX5_CQE_REQ_ERR	= 13,
+	MLX5_CQE_RESP_ERR	= 14,
+};
+
+enum {
+	MLX5_CQ_MODIFY_RESEIZE = 0,
+	MLX5_CQ_MODIFY_MODER = 1,
+	MLX5_CQ_MODIFY_MAPPING = 2,
+};
+
+struct mlx5_cq_modify_params {
+	int	type;
+	union {
+		struct {
+			u32	page_offset;
+			u8	log_cq_size;
+		} resize;
+
+		struct {
+		} moder;
+
+		struct {
+		} mapping;
+	} params;
+};
+
+enum {
+	CQE_SIZE_64 = 0,
+	CQE_SIZE_128 = 1,
+};
+
+static inline int cqe_sz_to_mlx_sz(u8 size)
+{
+	return size == 64 ? CQE_SIZE_64 : CQE_SIZE_128;
+}
+
+static inline void mlx5_cq_set_ci(struct mlx5_core_cq *cq)
+{
+	*cq->set_ci_db = cpu_to_be32(cq->cons_index & 0xffffff);
+}
+
+enum {
+	MLX5_CQ_DB_REQ_NOT_SOL		= 1 << 24,
+	MLX5_CQ_DB_REQ_NOT		= 0 << 24
+};
+
+static inline void mlx5_cq_arm(struct mlx5_core_cq *cq, u32 cmd,
+			       void __iomem *uar_page,
+			       spinlock_t *doorbell_lock)
+{
+	__be32 doorbell[2];
+	u32 sn;
+	u32 ci;
+
+	sn = cq->arm_sn & 3;
+	ci = cq->cons_index & 0xffffff;
+
+	*cq->arm_db = cpu_to_be32(sn << 28 | cmd | ci);
+
+	/* Make sure that the doorbell record in host memory is
+	 * written before ringing the doorbell via PCI MMIO.
+	 */
+	wmb();
+
+	doorbell[0] = cpu_to_be32(sn << 28 | cmd | ci);
+	doorbell[1] = cpu_to_be32(cq->cqn);
+
+	mlx5_write64(doorbell, uar_page + MLX5_CQ_DOORBELL, doorbell_lock);
+}
+
+int mlx5_init_cq_table(struct mlx5_core_dev *dev);
+void mlx5_cleanup_cq_table(struct mlx5_core_dev *dev);
+int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			struct mlx5_create_cq_mbox_in *in, int inlen);
+int mlx5_core_destroy_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq);
+int mlx5_core_query_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+		       struct mlx5_query_cq_mbox_out *out);
+int mlx5_core_modify_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq,
+			int type, struct mlx5_cq_modify_params *params);
+int mlx5_debug_cq_add(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq);
+void mlx5_debug_cq_remove(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq);
+
+#endif /* MLX5_CORE_CQ_H */
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
new file mode 100644
index 000000000000..51390915e538
--- /dev/null
+++ b/include/linux/mlx5/device.h
@@ -0,0 +1,893 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_DEVICE_H
+#define MLX5_DEVICE_H
+
+#include <linux/types.h>
+#include <rdma/ib_verbs.h>
+
+#if defined(__LITTLE_ENDIAN)
+#define MLX5_SET_HOST_ENDIANNESS	0
+#elif defined(__BIG_ENDIAN)
+#define MLX5_SET_HOST_ENDIANNESS	0x80
+#else
+#error Host endianness not defined
+#endif
+
+enum {
+	MLX5_MAX_COMMANDS		= 32,
+	MLX5_CMD_DATA_BLOCK_SIZE	= 512,
+	MLX5_PCI_CMD_XPORT		= 7,
+};
+
+enum {
+	MLX5_EXTENDED_UD_AV		= 0x80000000,
+};
+
+enum {
+	MLX5_CQ_STATE_ARMED		= 9,
+	MLX5_CQ_STATE_ALWAYS_ARMED	= 0xb,
+	MLX5_CQ_STATE_FIRED		= 0xa,
+};
+
+enum {
+	MLX5_STAT_RATE_OFFSET	= 5,
+};
+
+enum {
+	MLX5_INLINE_SEG = 0x80000000,
+};
+
+enum {
+	MLX5_PERM_LOCAL_READ	= 1 << 2,
+	MLX5_PERM_LOCAL_WRITE	= 1 << 3,
+	MLX5_PERM_REMOTE_READ	= 1 << 4,
+	MLX5_PERM_REMOTE_WRITE	= 1 << 5,
+	MLX5_PERM_ATOMIC	= 1 << 6,
+	MLX5_PERM_UMR_EN	= 1 << 7,
+};
+
+enum {
+	MLX5_PCIE_CTRL_SMALL_FENCE	= 1 << 0,
+	MLX5_PCIE_CTRL_RELAXED_ORDERING	= 1 << 2,
+	MLX5_PCIE_CTRL_NO_SNOOP		= 1 << 3,
+	MLX5_PCIE_CTRL_TLP_PROCE_EN	= 1 << 6,
+	MLX5_PCIE_CTRL_TPH_MASK		= 3 << 4,
+};
+
+enum {
+	MLX5_ACCESS_MODE_PA	= 0,
+	MLX5_ACCESS_MODE_MTT	= 1,
+	MLX5_ACCESS_MODE_KLM	= 2
+};
+
+enum {
+	MLX5_MKEY_REMOTE_INVAL	= 1 << 24,
+	MLX5_MKEY_FLAG_SYNC_UMR = 1 << 29,
+	MLX5_MKEY_BSF_EN	= 1 << 30,
+	MLX5_MKEY_LEN64		= 1 << 31,
+};
+
+enum {
+	MLX5_EN_RD	= (u64)1,
+	MLX5_EN_WR	= (u64)2
+};
+
+enum {
+	MLX5_BF_REGS_PER_PAGE	= 4,
+	MLX5_MAX_UAR_PAGES	= 1 << 8,
+	MLX5_MAX_UUARS		= MLX5_MAX_UAR_PAGES * MLX5_BF_REGS_PER_PAGE,
+};
+
+enum {
+	MLX5_MKEY_MASK_LEN		= 1ull << 0,
+	MLX5_MKEY_MASK_PAGE_SIZE	= 1ull << 1,
+	MLX5_MKEY_MASK_START_ADDR	= 1ull << 6,
+	MLX5_MKEY_MASK_PD		= 1ull << 7,
+	MLX5_MKEY_MASK_EN_RINVAL	= 1ull << 8,
+	MLX5_MKEY_MASK_BSF_EN		= 1ull << 12,
+	MLX5_MKEY_MASK_KEY		= 1ull << 13,
+	MLX5_MKEY_MASK_QPN		= 1ull << 14,
+	MLX5_MKEY_MASK_LR		= 1ull << 17,
+	MLX5_MKEY_MASK_LW		= 1ull << 18,
+	MLX5_MKEY_MASK_RR		= 1ull << 19,
+	MLX5_MKEY_MASK_RW		= 1ull << 20,
+	MLX5_MKEY_MASK_A		= 1ull << 21,
+	MLX5_MKEY_MASK_SMALL_FENCE	= 1ull << 23,
+	MLX5_MKEY_MASK_FREE		= 1ull << 29,
+};
+
+enum mlx5_event {
+	MLX5_EVENT_TYPE_COMP		   = 0x0,
+
+	MLX5_EVENT_TYPE_PATH_MIG	   = 0x01,
+	MLX5_EVENT_TYPE_COMM_EST	   = 0x02,
+	MLX5_EVENT_TYPE_SQ_DRAINED	   = 0x03,
+	MLX5_EVENT_TYPE_SRQ_LAST_WQE	   = 0x13,
+	MLX5_EVENT_TYPE_SRQ_RQ_LIMIT	   = 0x14,
+
+	MLX5_EVENT_TYPE_CQ_ERROR	   = 0x04,
+	MLX5_EVENT_TYPE_WQ_CATAS_ERROR	   = 0x05,
+	MLX5_EVENT_TYPE_PATH_MIG_FAILED	   = 0x07,
+	MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+	MLX5_EVENT_TYPE_WQ_ACCESS_ERROR	   = 0x11,
+	MLX5_EVENT_TYPE_SRQ_CATAS_ERROR	   = 0x12,
+
+	MLX5_EVENT_TYPE_INTERNAL_ERROR	   = 0x08,
+	MLX5_EVENT_TYPE_PORT_CHANGE	   = 0x09,
+	MLX5_EVENT_TYPE_GPIO_EVENT	   = 0x15,
+	MLX5_EVENT_TYPE_REMOTE_CONFIG	   = 0x19,
+
+	MLX5_EVENT_TYPE_DB_BF_CONGESTION   = 0x1a,
+	MLX5_EVENT_TYPE_STALL_EVENT	   = 0x1b,
+
+	MLX5_EVENT_TYPE_CMD		   = 0x0a,
+	MLX5_EVENT_TYPE_PAGE_REQUEST	   = 0xb,
+};
+
+enum {
+	MLX5_PORT_CHANGE_SUBTYPE_DOWN		= 1,
+	MLX5_PORT_CHANGE_SUBTYPE_ACTIVE		= 4,
+	MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED	= 5,
+	MLX5_PORT_CHANGE_SUBTYPE_LID		= 6,
+	MLX5_PORT_CHANGE_SUBTYPE_PKEY		= 7,
+	MLX5_PORT_CHANGE_SUBTYPE_GUID		= 8,
+	MLX5_PORT_CHANGE_SUBTYPE_CLIENT_REREG	= 9,
+};
+
+enum {
+	MLX5_DEV_CAP_FLAG_RC		= 1LL <<  0,
+	MLX5_DEV_CAP_FLAG_UC		= 1LL <<  1,
+	MLX5_DEV_CAP_FLAG_UD		= 1LL <<  2,
+	MLX5_DEV_CAP_FLAG_XRC		= 1LL <<  3,
+	MLX5_DEV_CAP_FLAG_SRQ		= 1LL <<  6,
+	MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR	= 1LL <<  8,
+	MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR	= 1LL <<  9,
+	MLX5_DEV_CAP_FLAG_APM		= 1LL << 17,
+	MLX5_DEV_CAP_FLAG_ATOMIC	= 1LL << 18,
+	MLX5_DEV_CAP_FLAG_ON_DMND_PG	= 1LL << 24,
+	MLX5_DEV_CAP_FLAG_RESIZE_SRQ	= 1LL << 32,
+	MLX5_DEV_CAP_FLAG_REMOTE_FENCE	= 1LL << 38,
+	MLX5_DEV_CAP_FLAG_TLP_HINTS	= 1LL << 39,
+	MLX5_DEV_CAP_FLAG_SIG_HAND_OVER	= 1LL << 40,
+	MLX5_DEV_CAP_FLAG_DCT		= 1LL << 41,
+	MLX5_DEV_CAP_FLAG_CMDIF_CSUM	= 1LL << 46,
+};
+
+enum {
+	MLX5_OPCODE_NOP			= 0x00,
+	MLX5_OPCODE_SEND_INVAL		= 0x01,
+	MLX5_OPCODE_RDMA_WRITE		= 0x08,
+	MLX5_OPCODE_RDMA_WRITE_IMM	= 0x09,
+	MLX5_OPCODE_SEND		= 0x0a,
+	MLX5_OPCODE_SEND_IMM		= 0x0b,
+	MLX5_OPCODE_RDMA_READ		= 0x10,
+	MLX5_OPCODE_ATOMIC_CS		= 0x11,
+	MLX5_OPCODE_ATOMIC_FA		= 0x12,
+	MLX5_OPCODE_ATOMIC_MASKED_CS	= 0x14,
+	MLX5_OPCODE_ATOMIC_MASKED_FA	= 0x15,
+	MLX5_OPCODE_BIND_MW		= 0x18,
+	MLX5_OPCODE_CONFIG_CMD		= 0x1f,
+
+	MLX5_RECV_OPCODE_RDMA_WRITE_IMM	= 0x00,
+	MLX5_RECV_OPCODE_SEND		= 0x01,
+	MLX5_RECV_OPCODE_SEND_IMM	= 0x02,
+	MLX5_RECV_OPCODE_SEND_INVAL	= 0x03,
+
+	MLX5_CQE_OPCODE_ERROR		= 0x1e,
+	MLX5_CQE_OPCODE_RESIZE		= 0x16,
+
+	MLX5_OPCODE_SET_PSV		= 0x20,
+	MLX5_OPCODE_GET_PSV		= 0x21,
+	MLX5_OPCODE_CHECK_PSV		= 0x22,
+	MLX5_OPCODE_RGET_PSV		= 0x26,
+	MLX5_OPCODE_RCHECK_PSV		= 0x27,
+
+	MLX5_OPCODE_UMR			= 0x25,
+
+};
+
+enum {
+	MLX5_SET_PORT_RESET_QKEY	= 0,
+	MLX5_SET_PORT_GUID0		= 16,
+	MLX5_SET_PORT_NODE_GUID		= 17,
+	MLX5_SET_PORT_SYS_GUID		= 18,
+	MLX5_SET_PORT_GID_TABLE		= 19,
+	MLX5_SET_PORT_PKEY_TABLE	= 20,
+};
+
+enum {
+	MLX5_MAX_PAGE_SHIFT		= 31
+};
+
+struct mlx5_inbox_hdr {
+	__be16		opcode;
+	u8		rsvd[4];
+	__be16		opmod;
+};
+
+struct mlx5_outbox_hdr {
+	u8		status;
+	u8		rsvd[3];
+	__be32		syndrome;
+};
+
+struct mlx5_cmd_query_adapter_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_cmd_query_adapter_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[24];
+	u8			intapin;
+	u8			rsvd1[13];
+	__be16			vsd_vendor_id;
+	u8			vsd[208];
+	u8			vsd_psid[16];
+};
+
+struct mlx5_hca_cap {
+	u8	rsvd1[16];
+	u8	log_max_srq_sz;
+	u8	log_max_qp_sz;
+	u8	rsvd2;
+	u8	log_max_qp;
+	u8	log_max_strq_sz;
+	u8	log_max_srqs;
+	u8	rsvd4[2];
+	u8	rsvd5;
+	u8	log_max_cq_sz;
+	u8	rsvd6;
+	u8	log_max_cq;
+	u8	log_max_eq_sz;
+	u8	log_max_mkey;
+	u8	rsvd7;
+	u8	log_max_eq;
+	u8	max_indirection;
+	u8	log_max_mrw_sz;
+	u8	log_max_bsf_list_sz;
+	u8	log_max_klm_list_sz;
+	u8	rsvd_8_0;
+	u8	log_max_ra_req_dc;
+	u8	rsvd_8_1;
+	u8	log_max_ra_res_dc;
+	u8	rsvd9;
+	u8	log_max_ra_req_qp;
+	u8	rsvd10;
+	u8	log_max_ra_res_qp;
+	u8	rsvd11[4];
+	__be16	max_qp_count;
+	__be16	rsvd12;
+	u8	rsvd13;
+	u8	local_ca_ack_delay;
+	u8	rsvd14;
+	u8	num_ports;
+	u8	log_max_msg;
+	u8	rsvd15[3];
+	__be16	stat_rate_support;
+	u8	rsvd16[2];
+	__be64	flags;
+	u8	rsvd17;
+	u8	uar_sz;
+	u8	rsvd18;
+	u8	log_pg_sz;
+	__be16	bf_log_bf_reg_size;
+	u8	rsvd19[4];
+	__be16	max_desc_sz_sq;
+	u8	rsvd20[2];
+	__be16	max_desc_sz_rq;
+	u8	rsvd21[2];
+	__be16	max_desc_sz_sq_dc;
+	u8	rsvd22[4];
+	__be16	max_qp_mcg;
+	u8	rsvd23;
+	u8	log_max_mcg;
+	u8	rsvd24;
+	u8	log_max_pd;
+	u8	rsvd25;
+	u8	log_max_xrcd;
+	u8	rsvd26[40];
+	__be32  uar_page_sz;
+	u8	rsvd27[28];
+	u8	log_msx_atomic_size_qp;
+	u8	rsvd28[2];
+	u8	log_msx_atomic_size_dc;
+	u8	rsvd29[76];
+};
+
+
+struct mlx5_cmd_query_hca_cap_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+
+struct mlx5_cmd_query_hca_cap_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+	struct mlx5_hca_cap     hca_cap;
+};
+
+
+struct mlx5_cmd_set_hca_cap_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+	struct mlx5_hca_cap     hca_cap;
+};
+
+
+struct mlx5_cmd_set_hca_cap_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+};
+
+
+struct mlx5_cmd_init_hca_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd0[2];
+	__be16			profile;
+	u8			rsvd1[4];
+};
+
+struct mlx5_cmd_init_hca_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_cmd_teardown_hca_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd0[2];
+	__be16			profile;
+	u8			rsvd1[4];
+};
+
+struct mlx5_cmd_teardown_hca_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_cmd_layout {
+	u8		type;
+	u8		rsvd0[3];
+	__be32		inlen;
+	__be64		in_ptr;
+	__be32		in[4];
+	__be32		out[4];
+	__be64		out_ptr;
+	__be32		outlen;
+	u8		token;
+	u8		sig;
+	u8		rsvd1;
+	u8		status_own;
+};
+
+
+struct health_buffer {
+	__be32		assert_var[5];
+	__be32		rsvd0[3];
+	__be32		assert_exit_ptr;
+	__be32		assert_callra;
+	__be32		rsvd1[2];
+	__be32		fw_ver;
+	__be32		hw_id;
+	__be32		rsvd2;
+	u8		irisc_index;
+	u8		synd;
+	__be16		ext_sync;
+};
+
+struct mlx5_init_seg {
+	__be32			fw_rev;
+	__be32			cmdif_rev_fw_sub;
+	__be32			rsvd0[2];
+	__be32			cmdq_addr_h;
+	__be32			cmdq_addr_l_sz;
+	__be32			cmd_dbell;
+	__be32			rsvd1[121];
+	struct health_buffer	health;
+	__be32			rsvd2[884];
+	__be32			health_counter;
+	__be32			rsvd3[1023];
+	__be64			ieee1588_clk;
+	__be32			ieee1588_clk_type;
+	__be32			clr_intx;
+};
+
+struct mlx5_eqe_comp {
+	__be32	reserved[6];
+	__be32	cqn;
+};
+
+struct mlx5_eqe_qp_srq {
+	__be32	reserved[6];
+	__be32	qp_srq_n;
+};
+
+struct mlx5_eqe_cq_err {
+	__be32	cqn;
+	u8	reserved1[7];
+	u8	syndrome;
+};
+
+struct mlx5_eqe_dropped_packet {
+};
+
+struct mlx5_eqe_port_state {
+	u8	reserved0[8];
+	u8	port;
+};
+
+struct mlx5_eqe_gpio {
+	__be32	reserved0[2];
+	__be64	gpio_event;
+};
+
+struct mlx5_eqe_congestion {
+	u8	type;
+	u8	rsvd0;
+	u8	congestion_level;
+};
+
+struct mlx5_eqe_stall_vl {
+	u8	rsvd0[3];
+	u8	port_vl;
+};
+
+struct mlx5_eqe_cmd {
+	__be32	vector;
+	__be32	rsvd[6];
+};
+
+struct mlx5_eqe_page_req {
+	u8		rsvd0[2];
+	__be16		func_id;
+	u8		rsvd1[2];
+	__be16		num_pages;
+	__be32		rsvd2[5];
+};
+
+union ev_data {
+	__be32				raw[7];
+	struct mlx5_eqe_cmd		cmd;
+	struct mlx5_eqe_comp		comp;
+	struct mlx5_eqe_qp_srq		qp_srq;
+	struct mlx5_eqe_cq_err		cq_err;
+	struct mlx5_eqe_dropped_packet	dp;
+	struct mlx5_eqe_port_state	port;
+	struct mlx5_eqe_gpio		gpio;
+	struct mlx5_eqe_congestion	cong;
+	struct mlx5_eqe_stall_vl	stall_vl;
+	struct mlx5_eqe_page_req	req_pages;
+} __packed;
+
+struct mlx5_eqe {
+	u8		rsvd0;
+	u8		type;
+	u8		rsvd1;
+	u8		sub_type;
+	__be32		rsvd2[7];
+	union ev_data	data;
+	__be16		rsvd3;
+	u8		signature;
+	u8		owner;
+} __packed;
+
+struct mlx5_cmd_prot_block {
+	u8		data[MLX5_CMD_DATA_BLOCK_SIZE];
+	u8		rsvd0[48];
+	__be64		next;
+	__be32		block_num;
+	u8		rsvd1;
+	u8		token;
+	u8		ctrl_sig;
+	u8		sig;
+};
+
+struct mlx5_err_cqe {
+	u8	rsvd0[32];
+	__be32	srqn;
+	u8	rsvd1[18];
+	u8	vendor_err_synd;
+	u8	syndrome;
+	__be32	s_wqe_opcode_qpn;
+	__be16	wqe_counter;
+	u8	signature;
+	u8	op_own;
+};
+
+struct mlx5_cqe64 {
+	u8		rsvd0[17];
+	u8		ml_path;
+	u8		rsvd20[4];
+	__be16		slid;
+	__be32		flags_rqpn;
+	u8		rsvd28[4];
+	__be32		srqn;
+	__be32		imm_inval_pkey;
+	u8		rsvd40[4];
+	__be32		byte_cnt;
+	__be64		timestamp;
+	__be32		sop_drop_qpn;
+	__be16		wqe_counter;
+	u8		signature;
+	u8		op_own;
+};
+
+struct mlx5_wqe_srq_next_seg {
+	u8			rsvd0[2];
+	__be16			next_wqe_index;
+	u8			signature;
+	u8			rsvd1[11];
+};
+
+union mlx5_ext_cqe {
+	struct ib_grh	grh;
+	u8		inl[64];
+};
+
+struct mlx5_cqe128 {
+	union mlx5_ext_cqe	inl_grh;
+	struct mlx5_cqe64	cqe64;
+};
+
+struct mlx5_srq_ctx {
+	u8			state_log_sz;
+	u8			rsvd0[3];
+	__be32			flags_xrcd;
+	__be32			pgoff_cqn;
+	u8			rsvd1[4];
+	u8			log_pg_sz;
+	u8			rsvd2[7];
+	__be32			pd;
+	__be16			lwm;
+	__be16			wqe_cnt;
+	u8			rsvd3[8];
+	__be64			db_record;
+};
+
+struct mlx5_create_srq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			input_srqn;
+	u8			rsvd0[4];
+	struct mlx5_srq_ctx	ctx;
+	u8			rsvd1[208];
+	__be64			pas[0];
+};
+
+struct mlx5_create_srq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			srqn;
+	u8			rsvd[4];
+};
+
+struct mlx5_destroy_srq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			srqn;
+	u8			rsvd[4];
+};
+
+struct mlx5_destroy_srq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_query_srq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			srqn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_query_srq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+	struct mlx5_srq_ctx	ctx;
+	u8			rsvd1[32];
+	__be64			pas[0];
+};
+
+struct mlx5_arm_srq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			srqn;
+	__be16			rsvd;
+	__be16			lwm;
+};
+
+struct mlx5_arm_srq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_cq_context {
+	u8			status;
+	u8			cqe_sz_flags;
+	u8			st;
+	u8			rsvd3;
+	u8			rsvd4[6];
+	__be16			page_offset;
+	__be32			log_sz_usr_page;
+	__be16			cq_period;
+	__be16			cq_max_count;
+	__be16			rsvd20;
+	__be16			c_eqn;
+	u8			log_pg_sz;
+	u8			rsvd25[7];
+	__be32			last_notified_index;
+	__be32			solicit_producer_index;
+	__be32			consumer_counter;
+	__be32			producer_counter;
+	u8			rsvd48[8];
+	__be64			db_record_addr;
+};
+
+struct mlx5_create_cq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			input_cqn;
+	u8			rsvdx[4];
+	struct mlx5_cq_context	ctx;
+	u8			rsvd6[192];
+	__be64			pas[0];
+};
+
+struct mlx5_create_cq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			cqn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_destroy_cq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			cqn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_destroy_cq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+};
+
+struct mlx5_query_cq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			cqn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_query_cq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+	struct mlx5_cq_context	ctx;
+	u8			rsvd6[16];
+	__be64			pas[0];
+};
+
+struct mlx5_eq_context {
+	u8			status;
+	u8			ec_oi;
+	u8			st;
+	u8			rsvd2[7];
+	__be16			page_pffset;
+	__be32			log_sz_usr_page;
+	u8			rsvd3[7];
+	u8			intr;
+	u8			log_page_size;
+	u8			rsvd4[15];
+	__be32			consumer_counter;
+	__be32			produser_counter;
+	u8			rsvd5[16];
+};
+
+struct mlx5_create_eq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd0[3];
+	u8			input_eqn;
+	u8			rsvd1[4];
+	struct mlx5_eq_context	ctx;
+	u8			rsvd2[8];
+	__be64			events_mask;
+	u8			rsvd3[176];
+	__be64			pas[0];
+};
+
+struct mlx5_create_eq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[3];
+	u8			eq_number;
+	u8			rsvd1[4];
+};
+
+struct mlx5_destroy_eq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd0[3];
+	u8			eqn;
+	u8			rsvd1[4];
+};
+
+struct mlx5_destroy_eq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_map_eq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be64			mask;
+	u8			mu;
+	u8			rsvd0[2];
+	u8			eqn;
+	u8			rsvd1[24];
+};
+
+struct mlx5_map_eq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_query_eq_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd0[3];
+	u8			eqn;
+	u8			rsvd1[4];
+};
+
+struct mlx5_query_eq_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+	struct mlx5_eq_context	ctx;
+};
+
+struct mlx5_mkey_seg {
+	/* This is a two bit field occupying bits 31-30.
+	 * bit 31 is always 0,
+	 * bit 30 is zero for regular MRs and 1 (e.g free) for UMRs that do not have tanslation
+	 */
+	u8		status;
+	u8		pcie_control;
+	u8		flags;
+	u8		version;
+	__be32		qpn_mkey7_0;
+	u8		rsvd1[4];
+	__be32		flags_pd;
+	__be64		start_addr;
+	__be64		len;
+	__be32		bsfs_octo_size;
+	u8		rsvd2[16];
+	__be32		xlt_oct_size;
+	u8		rsvd3[3];
+	u8		log2_page_size;
+	u8		rsvd4[4];
+};
+
+struct mlx5_query_special_ctxs_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_query_special_ctxs_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			dump_fill_mkey;
+	__be32			reserved_lkey;
+};
+
+struct mlx5_create_mkey_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			input_mkey_index;
+	u8			rsvd0[4];
+	struct mlx5_mkey_seg	seg;
+	u8			rsvd1[16];
+	__be32			xlat_oct_act_size;
+	__be32			bsf_coto_act_size;
+	u8			rsvd2[168];
+	__be64			pas[0];
+};
+
+struct mlx5_create_mkey_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			mkey;
+	u8			rsvd[4];
+};
+
+struct mlx5_destroy_mkey_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			mkey;
+	u8			rsvd[4];
+};
+
+struct mlx5_destroy_mkey_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_query_mkey_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			mkey;
+};
+
+struct mlx5_query_mkey_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be64			pas[0];
+};
+
+struct mlx5_modify_mkey_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			mkey;
+	__be64			pas[0];
+};
+
+struct mlx5_modify_mkey_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+};
+
+struct mlx5_dump_mkey_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+};
+
+struct mlx5_dump_mkey_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			mkey;
+};
+
+struct mlx5_mad_ifc_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be16			remote_lid;
+	u8			rsvd0;
+	u8			port;
+	u8			rsvd1[4];
+	u8			data[256];
+};
+
+struct mlx5_mad_ifc_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+	u8			data[256];
+};
+
+struct mlx5_access_reg_mbox_in {
+	struct mlx5_inbox_hdr		hdr;
+	u8				rsvd0[2];
+	__be16				register_id;
+	__be32				arg;
+	__be32				data[0];
+};
+
+struct mlx5_access_reg_mbox_out {
+	struct mlx5_outbox_hdr		hdr;
+	u8				rsvd[8];
+	__be32				data[0];
+};
+
+#define MLX5_ATTR_EXTENDED_PORT_INFO	cpu_to_be16(0xff90)
+
+enum {
+	MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO	= 1 <<  0
+};
+
+#endif /* MLX5_DEVICE_H */
diff --git a/include/linux/mlx5/doorbell.h b/include/linux/mlx5/doorbell.h
new file mode 100644
index 000000000000..163a818411e7
--- /dev/null
+++ b/include/linux/mlx5/doorbell.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_DOORBELL_H
+#define MLX5_DOORBELL_H
+
+#define MLX5_BF_OFFSET	      0x800
+#define MLX5_CQ_DOORBELL      0x20
+
+#if BITS_PER_LONG == 64
+/* Assume that we can just write a 64-bit doorbell atomically.  s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MLX5_DECLARE_DOORBELL_LOCK(name)
+#define MLX5_INIT_DOORBELL_LOCK(ptr)    do { } while (0)
+#define MLX5_GET_DOORBELL_LOCK(ptr)      (NULL)
+
+static inline void mlx5_write64(__be32 val[2], void __iomem *dest,
+				spinlock_t *doorbell_lock)
+{
+	__raw_writeq(*(u64 *)val, dest);
+}
+
+#else
+
+/* Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MLX5_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MLX5_INIT_DOORBELL_LOCK(ptr)     spin_lock_init(ptr)
+#define MLX5_GET_DOORBELL_LOCK(ptr)      (ptr)
+
+static inline void mlx5_write64(__be32 val[2], void __iomem *dest,
+				spinlock_t *doorbell_lock)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(doorbell_lock, flags);
+	__raw_writel((__force u32) val[0], dest);
+	__raw_writel((__force u32) val[1], dest + 4);
+	spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+#endif
+
+#endif /* MLX5_DOORBELL_H */
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
new file mode 100644
index 000000000000..e47f1e4c9b03
--- /dev/null
+++ b/include/linux/mlx5/driver.h
@@ -0,0 +1,769 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_DRIVER_H
+#define MLX5_DRIVER_H
+
+#include <linux/kernel.h>
+#include <linux/completion.h>
+#include <linux/pci.h>
+#include <linux/spinlock_types.h>
+#include <linux/semaphore.h>
+#include <linux/vmalloc.h>
+#include <linux/radix-tree.h>
+#include <linux/mlx5/device.h>
+#include <linux/mlx5/doorbell.h>
+
+enum {
+	MLX5_BOARD_ID_LEN = 64,
+	MLX5_MAX_NAME_LEN = 16,
+};
+
+enum {
+	/* one minute for the sake of bringup. Generally, commands must always
+	 * complete and we may need to increase this timeout value
+	 */
+	MLX5_CMD_TIMEOUT_MSEC	= 7200 * 1000,
+	MLX5_CMD_WQ_MAX_NAME	= 32,
+};
+
+enum {
+	CMD_OWNER_SW		= 0x0,
+	CMD_OWNER_HW		= 0x1,
+	CMD_STATUS_SUCCESS	= 0,
+};
+
+enum mlx5_sqp_t {
+	MLX5_SQP_SMI		= 0,
+	MLX5_SQP_GSI		= 1,
+	MLX5_SQP_IEEE_1588	= 2,
+	MLX5_SQP_SNIFFER	= 3,
+	MLX5_SQP_SYNC_UMR	= 4,
+};
+
+enum {
+	MLX5_MAX_PORTS	= 2,
+};
+
+enum {
+	MLX5_EQ_VEC_PAGES	 = 0,
+	MLX5_EQ_VEC_CMD		 = 1,
+	MLX5_EQ_VEC_ASYNC	 = 2,
+	MLX5_EQ_VEC_COMP_BASE,
+};
+
+enum {
+	MLX5_MAX_EQ_NAME	= 20
+};
+
+enum {
+	MLX5_ATOMIC_MODE_IB_COMP	= 1 << 16,
+	MLX5_ATOMIC_MODE_CX		= 2 << 16,
+	MLX5_ATOMIC_MODE_8B		= 3 << 16,
+	MLX5_ATOMIC_MODE_16B		= 4 << 16,
+	MLX5_ATOMIC_MODE_32B		= 5 << 16,
+	MLX5_ATOMIC_MODE_64B		= 6 << 16,
+	MLX5_ATOMIC_MODE_128B		= 7 << 16,
+	MLX5_ATOMIC_MODE_256B		= 8 << 16,
+};
+
+enum {
+	MLX5_CMD_OP_QUERY_HCA_CAP		= 0x100,
+	MLX5_CMD_OP_QUERY_ADAPTER		= 0x101,
+	MLX5_CMD_OP_INIT_HCA			= 0x102,
+	MLX5_CMD_OP_TEARDOWN_HCA		= 0x103,
+	MLX5_CMD_OP_QUERY_PAGES			= 0x107,
+	MLX5_CMD_OP_MANAGE_PAGES		= 0x108,
+	MLX5_CMD_OP_SET_HCA_CAP			= 0x109,
+
+	MLX5_CMD_OP_CREATE_MKEY			= 0x200,
+	MLX5_CMD_OP_QUERY_MKEY			= 0x201,
+	MLX5_CMD_OP_DESTROY_MKEY		= 0x202,
+	MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS	= 0x203,
+
+	MLX5_CMD_OP_CREATE_EQ			= 0x301,
+	MLX5_CMD_OP_DESTROY_EQ			= 0x302,
+	MLX5_CMD_OP_QUERY_EQ			= 0x303,
+
+	MLX5_CMD_OP_CREATE_CQ			= 0x400,
+	MLX5_CMD_OP_DESTROY_CQ			= 0x401,
+	MLX5_CMD_OP_QUERY_CQ			= 0x402,
+	MLX5_CMD_OP_MODIFY_CQ			= 0x403,
+
+	MLX5_CMD_OP_CREATE_QP			= 0x500,
+	MLX5_CMD_OP_DESTROY_QP			= 0x501,
+	MLX5_CMD_OP_RST2INIT_QP			= 0x502,
+	MLX5_CMD_OP_INIT2RTR_QP			= 0x503,
+	MLX5_CMD_OP_RTR2RTS_QP			= 0x504,
+	MLX5_CMD_OP_RTS2RTS_QP			= 0x505,
+	MLX5_CMD_OP_SQERR2RTS_QP		= 0x506,
+	MLX5_CMD_OP_2ERR_QP			= 0x507,
+	MLX5_CMD_OP_RTS2SQD_QP			= 0x508,
+	MLX5_CMD_OP_SQD2RTS_QP			= 0x509,
+	MLX5_CMD_OP_2RST_QP			= 0x50a,
+	MLX5_CMD_OP_QUERY_QP			= 0x50b,
+	MLX5_CMD_OP_CONF_SQP			= 0x50c,
+	MLX5_CMD_OP_MAD_IFC			= 0x50d,
+	MLX5_CMD_OP_INIT2INIT_QP		= 0x50e,
+	MLX5_CMD_OP_SUSPEND_QP			= 0x50f,
+	MLX5_CMD_OP_UNSUSPEND_QP		= 0x510,
+	MLX5_CMD_OP_SQD2SQD_QP			= 0x511,
+	MLX5_CMD_OP_ALLOC_QP_COUNTER_SET	= 0x512,
+	MLX5_CMD_OP_DEALLOC_QP_COUNTER_SET	= 0x513,
+	MLX5_CMD_OP_QUERY_QP_COUNTER_SET	= 0x514,
+
+	MLX5_CMD_OP_CREATE_PSV			= 0x600,
+	MLX5_CMD_OP_DESTROY_PSV			= 0x601,
+	MLX5_CMD_OP_QUERY_PSV			= 0x602,
+	MLX5_CMD_OP_QUERY_SIG_RULE_TABLE	= 0x603,
+	MLX5_CMD_OP_QUERY_BLOCK_SIZE_TABLE	= 0x604,
+
+	MLX5_CMD_OP_CREATE_SRQ			= 0x700,
+	MLX5_CMD_OP_DESTROY_SRQ			= 0x701,
+	MLX5_CMD_OP_QUERY_SRQ			= 0x702,
+	MLX5_CMD_OP_ARM_RQ			= 0x703,
+	MLX5_CMD_OP_RESIZE_SRQ			= 0x704,
+
+	MLX5_CMD_OP_ALLOC_PD			= 0x800,
+	MLX5_CMD_OP_DEALLOC_PD			= 0x801,
+	MLX5_CMD_OP_ALLOC_UAR			= 0x802,
+	MLX5_CMD_OP_DEALLOC_UAR			= 0x803,
+
+	MLX5_CMD_OP_ATTACH_TO_MCG		= 0x806,
+	MLX5_CMD_OP_DETACH_FROM_MCG		= 0x807,
+
+
+	MLX5_CMD_OP_ALLOC_XRCD			= 0x80e,
+	MLX5_CMD_OP_DEALLOC_XRCD		= 0x80f,
+
+	MLX5_CMD_OP_ACCESS_REG			= 0x805,
+	MLX5_CMD_OP_MAX				= 0x810,
+};
+
+enum {
+	MLX5_REG_PCAP		 = 0x5001,
+	MLX5_REG_PMTU		 = 0x5003,
+	MLX5_REG_PTYS		 = 0x5004,
+	MLX5_REG_PAOS		 = 0x5006,
+	MLX5_REG_PMAOS		 = 0x5012,
+	MLX5_REG_PUDE		 = 0x5009,
+	MLX5_REG_PMPE		 = 0x5010,
+	MLX5_REG_PELC		 = 0x500e,
+	MLX5_REG_PMLP		 = 0, /* TBD */
+	MLX5_REG_NODE_DESC	 = 0x6001,
+	MLX5_REG_HOST_ENDIANNESS = 0x7004,
+};
+
+enum dbg_rsc_type {
+	MLX5_DBG_RSC_QP,
+	MLX5_DBG_RSC_EQ,
+	MLX5_DBG_RSC_CQ,
+};
+
+struct mlx5_field_desc {
+	struct dentry	       *dent;
+	int			i;
+};
+
+struct mlx5_rsc_debug {
+	struct mlx5_core_dev   *dev;
+	void		       *object;
+	enum dbg_rsc_type	type;
+	struct dentry	       *root;
+	struct mlx5_field_desc	fields[0];
+};
+
+enum mlx5_dev_event {
+	MLX5_DEV_EVENT_SYS_ERROR,
+	MLX5_DEV_EVENT_PORT_UP,
+	MLX5_DEV_EVENT_PORT_DOWN,
+	MLX5_DEV_EVENT_PORT_INITIALIZED,
+	MLX5_DEV_EVENT_LID_CHANGE,
+	MLX5_DEV_EVENT_PKEY_CHANGE,
+	MLX5_DEV_EVENT_GUID_CHANGE,
+	MLX5_DEV_EVENT_CLIENT_REREG,
+};
+
+struct mlx5_uuar_info {
+	struct mlx5_uar	       *uars;
+	int			num_uars;
+	int			num_low_latency_uuars;
+	unsigned long	       *bitmap;
+	unsigned int	       *count;
+	struct mlx5_bf	       *bfs;
+
+	/*
+	 * protect uuar allocation data structs
+	 */
+	struct mutex		lock;
+};
+
+struct mlx5_bf {
+	void __iomem	       *reg;
+	void __iomem	       *regreg;
+	int			buf_size;
+	struct mlx5_uar	       *uar;
+	unsigned long		offset;
+	int			need_lock;
+	/* protect blue flame buffer selection when needed
+	 */
+	spinlock_t		lock;
+
+	/* serialize 64 bit writes when done as two 32 bit accesses
+	 */
+	spinlock_t		lock32;
+	int			uuarn;
+};
+
+struct mlx5_cmd_first {
+	__be32		data[4];
+};
+
+struct mlx5_cmd_msg {
+	struct list_head		list;
+	struct cache_ent	       *cache;
+	u32				len;
+	struct mlx5_cmd_first		first;
+	struct mlx5_cmd_mailbox	       *next;
+};
+
+struct mlx5_cmd_debug {
+	struct dentry	       *dbg_root;
+	struct dentry	       *dbg_in;
+	struct dentry	       *dbg_out;
+	struct dentry	       *dbg_outlen;
+	struct dentry	       *dbg_status;
+	struct dentry	       *dbg_run;
+	void		       *in_msg;
+	void		       *out_msg;
+	u8			status;
+	u16			inlen;
+	u16			outlen;
+};
+
+struct cache_ent {
+	/* protect block chain allocations
+	 */
+	spinlock_t		lock;
+	struct list_head	head;
+};
+
+struct cmd_msg_cache {
+	struct cache_ent	large;
+	struct cache_ent	med;
+
+};
+
+struct mlx5_cmd_stats {
+	u64		sum;
+	u64		n;
+	struct dentry  *root;
+	struct dentry  *avg;
+	struct dentry  *count;
+	/* protect command average calculations */
+	spinlock_t	lock;
+};
+
+struct mlx5_cmd {
+	void	       *cmd_buf;
+	dma_addr_t	dma;
+	u16		cmdif_rev;
+	u8		log_sz;
+	u8		log_stride;
+	int		max_reg_cmds;
+	int		events;
+	u32 __iomem    *vector;
+
+	/* protect command queue allocations
+	 */
+	spinlock_t	alloc_lock;
+
+	/* protect token allocations
+	 */
+	spinlock_t	token_lock;
+	u8		token;
+	unsigned long	bitmask;
+	char		wq_name[MLX5_CMD_WQ_MAX_NAME];
+	struct workqueue_struct *wq;
+	struct semaphore sem;
+	struct semaphore pages_sem;
+	int	mode;
+	struct mlx5_cmd_work_ent *ent_arr[MLX5_MAX_COMMANDS];
+	struct pci_pool *pool;
+	struct mlx5_cmd_debug dbg;
+	struct cmd_msg_cache cache;
+	int checksum_disabled;
+	struct mlx5_cmd_stats stats[MLX5_CMD_OP_MAX];
+};
+
+struct mlx5_port_caps {
+	int	gid_table_len;
+	int	pkey_table_len;
+};
+
+struct mlx5_caps {
+	u8	log_max_eq;
+	u8	log_max_cq;
+	u8	log_max_qp;
+	u8	log_max_mkey;
+	u8	log_max_pd;
+	u8	log_max_srq;
+	u32	max_cqes;
+	int	max_wqes;
+	int	max_sq_desc_sz;
+	int	max_rq_desc_sz;
+	u64	flags;
+	u16	stat_rate_support;
+	int	log_max_msg;
+	int	num_ports;
+	int	max_ra_res_qp;
+	int	max_ra_req_qp;
+	int	max_srq_wqes;
+	int	bf_reg_size;
+	int	bf_regs_per_page;
+	struct mlx5_port_caps	port[MLX5_MAX_PORTS];
+	u8			ext_port_cap[MLX5_MAX_PORTS];
+	int	max_vf;
+	u32	reserved_lkey;
+	u8	local_ca_ack_delay;
+	u8	log_max_mcg;
+	u16	max_qp_mcg;
+	int	min_page_sz;
+};
+
+struct mlx5_cmd_mailbox {
+	void	       *buf;
+	dma_addr_t	dma;
+	struct mlx5_cmd_mailbox *next;
+};
+
+struct mlx5_buf_list {
+	void		       *buf;
+	dma_addr_t		map;
+};
+
+struct mlx5_buf {
+	struct mlx5_buf_list	direct;
+	struct mlx5_buf_list   *page_list;
+	int			nbufs;
+	int			npages;
+	int			page_shift;
+	int			size;
+};
+
+struct mlx5_eq {
+	struct mlx5_core_dev   *dev;
+	__be32 __iomem	       *doorbell;
+	u32			cons_index;
+	struct mlx5_buf		buf;
+	int			size;
+	u8			irqn;
+	u8			eqn;
+	int			nent;
+	u64			mask;
+	char			name[MLX5_MAX_EQ_NAME];
+	struct list_head	list;
+	int			index;
+	struct mlx5_rsc_debug	*dbg;
+};
+
+
+struct mlx5_core_mr {
+	u64			iova;
+	u64			size;
+	u32			key;
+	u32			pd;
+	u32			access;
+};
+
+struct mlx5_core_srq {
+	u32		srqn;
+	int		max;
+	int		max_gs;
+	int		max_avail_gather;
+	int		wqe_shift;
+	void (*event)	(struct mlx5_core_srq *, enum mlx5_event);
+
+	atomic_t		refcount;
+	struct completion	free;
+};
+
+struct mlx5_eq_table {
+	void __iomem	       *update_ci;
+	void __iomem	       *update_arm_ci;
+	struct list_head       *comp_eq_head;
+	struct mlx5_eq		pages_eq;
+	struct mlx5_eq		async_eq;
+	struct mlx5_eq		cmd_eq;
+	struct msix_entry	*msix_arr;
+	int			num_comp_vectors;
+	/* protect EQs list
+	 */
+	spinlock_t		lock;
+};
+
+struct mlx5_uar {
+	u32			index;
+	struct list_head	bf_list;
+	unsigned		free_bf_bmap;
+	void __iomem	       *wc_map;
+	void __iomem	       *map;
+};
+
+
+struct mlx5_core_health {
+	struct health_buffer __iomem   *health;
+	__be32 __iomem		       *health_counter;
+	struct timer_list		timer;
+	struct list_head		list;
+	u32				prev;
+	int				miss_counter;
+};
+
+struct mlx5_cq_table {
+	/* protect radix tree
+	 */
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+};
+
+struct mlx5_qp_table {
+	/* protect radix tree
+	 */
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+};
+
+struct mlx5_srq_table {
+	/* protect radix tree
+	 */
+	spinlock_t		lock;
+	struct radix_tree_root	tree;
+};
+
+struct mlx5_priv {
+	char			name[MLX5_MAX_NAME_LEN];
+	struct mlx5_eq_table	eq_table;
+	struct mlx5_uuar_info	uuari;
+	MLX5_DECLARE_DOORBELL_LOCK(cq_uar_lock);
+
+	/* pages stuff */
+	struct workqueue_struct *pg_wq;
+	struct rb_root		page_root;
+	int			fw_pages;
+	int			reg_pages;
+
+	struct mlx5_core_health health;
+
+	struct mlx5_srq_table	srq_table;
+
+	/* start: qp staff */
+	struct mlx5_qp_table	qp_table;
+	struct dentry	       *qp_debugfs;
+	struct dentry	       *eq_debugfs;
+	struct dentry	       *cq_debugfs;
+	struct dentry	       *cmdif_debugfs;
+	/* end: qp staff */
+
+	/* start: cq staff */
+	struct mlx5_cq_table	cq_table;
+	/* end: cq staff */
+
+	/* start: alloc staff */
+	struct mutex            pgdir_mutex;
+	struct list_head        pgdir_list;
+	/* end: alloc staff */
+	struct dentry	       *dbg_root;
+
+	/* protect mkey key part */
+	spinlock_t		mkey_lock;
+	u8			mkey_key;
+};
+
+struct mlx5_core_dev {
+	struct pci_dev	       *pdev;
+	u8			rev_id;
+	char			board_id[MLX5_BOARD_ID_LEN];
+	struct mlx5_cmd		cmd;
+	struct mlx5_caps	caps;
+	phys_addr_t		iseg_base;
+	struct mlx5_init_seg __iomem *iseg;
+	void			(*event) (struct mlx5_core_dev *dev,
+					  enum mlx5_dev_event event,
+					  void *data);
+	struct mlx5_priv	priv;
+	struct mlx5_profile	*profile;
+	atomic_t		num_qps;
+};
+
+struct mlx5_db {
+	__be32			*db;
+	union {
+		struct mlx5_db_pgdir		*pgdir;
+		struct mlx5_ib_user_db_page	*user_page;
+	}			u;
+	dma_addr_t		dma;
+	int			index;
+};
+
+enum {
+	MLX5_DB_PER_PAGE = PAGE_SIZE / L1_CACHE_BYTES,
+};
+
+enum {
+	MLX5_COMP_EQ_SIZE = 1024,
+};
+
+struct mlx5_db_pgdir {
+	struct list_head	list;
+	DECLARE_BITMAP(bitmap, MLX5_DB_PER_PAGE);
+	__be32		       *db_page;
+	dma_addr_t		db_dma;
+};
+
+typedef void (*mlx5_cmd_cbk_t)(int status, void *context);
+
+struct mlx5_cmd_work_ent {
+	struct mlx5_cmd_msg    *in;
+	struct mlx5_cmd_msg    *out;
+	mlx5_cmd_cbk_t		callback;
+	void		       *context;
+	int idx;
+	struct completion	done;
+	struct mlx5_cmd        *cmd;
+	struct work_struct	work;
+	struct mlx5_cmd_layout *lay;
+	int			ret;
+	int			page_queue;
+	u8			status;
+	u8			token;
+	struct timespec		ts1;
+	struct timespec		ts2;
+};
+
+struct mlx5_pas {
+	u64	pa;
+	u8	log_sz;
+};
+
+static inline void *mlx5_buf_offset(struct mlx5_buf *buf, int offset)
+{
+	if (likely(BITS_PER_LONG == 64 || buf->nbufs == 1))
+		return buf->direct.buf + offset;
+	else
+		return buf->page_list[offset >> PAGE_SHIFT].buf +
+			(offset & (PAGE_SIZE - 1));
+}
+
+extern struct workqueue_struct *mlx5_core_wq;
+
+#define STRUCT_FIELD(header, field) \
+	.struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field),      \
+	.struct_size_bytes   = sizeof((struct ib_unpacked_ ## header *)0)->field
+
+struct ib_field {
+	size_t struct_offset_bytes;
+	size_t struct_size_bytes;
+	int    offset_bits;
+	int    size_bits;
+};
+
+static inline struct mlx5_core_dev *pci2mlx5_core_dev(struct pci_dev *pdev)
+{
+	return pci_get_drvdata(pdev);
+}
+
+extern struct dentry *mlx5_debugfs_root;
+
+static inline u16 fw_rev_maj(struct mlx5_core_dev *dev)
+{
+	return ioread32be(&dev->iseg->fw_rev) & 0xffff;
+}
+
+static inline u16 fw_rev_min(struct mlx5_core_dev *dev)
+{
+	return ioread32be(&dev->iseg->fw_rev) >> 16;
+}
+
+static inline u16 fw_rev_sub(struct mlx5_core_dev *dev)
+{
+	return ioread32be(&dev->iseg->cmdif_rev_fw_sub) & 0xffff;
+}
+
+static inline u16 cmdif_rev(struct mlx5_core_dev *dev)
+{
+	return ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16;
+}
+
+static inline void *mlx5_vzalloc(unsigned long size)
+{
+	void *rtn;
+
+	rtn = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
+	if (!rtn)
+		rtn = vzalloc(size);
+	return rtn;
+}
+
+static inline void mlx5_vfree(const void *addr)
+{
+	if (addr && is_vmalloc_addr(addr))
+		vfree(addr);
+	else
+		kfree(addr);
+}
+
+int mlx5_dev_init(struct mlx5_core_dev *dev, struct pci_dev *pdev);
+void mlx5_dev_cleanup(struct mlx5_core_dev *dev);
+int mlx5_cmd_init(struct mlx5_core_dev *dev);
+void mlx5_cmd_cleanup(struct mlx5_core_dev *dev);
+void mlx5_cmd_use_events(struct mlx5_core_dev *dev);
+void mlx5_cmd_use_polling(struct mlx5_core_dev *dev);
+int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr);
+int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out,
+		  int out_size);
+int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
+int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
+int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
+int mlx5_free_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari);
+void mlx5_health_cleanup(void);
+void  __init mlx5_health_init(void);
+void mlx5_start_health_poll(struct mlx5_core_dev *dev);
+void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
+int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, int max_direct,
+		   struct mlx5_buf *buf);
+void mlx5_buf_free(struct mlx5_core_dev *dev, struct mlx5_buf *buf);
+struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev,
+						      gfp_t flags, int npages);
+void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev,
+				 struct mlx5_cmd_mailbox *head);
+int mlx5_core_create_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			 struct mlx5_create_srq_mbox_in *in, int inlen);
+int mlx5_core_destroy_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq);
+int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+			struct mlx5_query_srq_mbox_out *out);
+int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq,
+		      u16 lwm, int is_srq);
+int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			  struct mlx5_create_mkey_mbox_in *in, int inlen);
+int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr);
+int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			 struct mlx5_query_mkey_mbox_out *out, int outlen);
+int mlx5_core_dump_fill_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr,
+			     u32 *mkey);
+int mlx5_core_alloc_pd(struct mlx5_core_dev *dev, u32 *pdn);
+int mlx5_core_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn);
+int mlx5_core_mad_ifc(struct mlx5_core_dev *dev, void *inb, void *outb,
+		      u16 opmod, int port);
+void mlx5_pagealloc_init(struct mlx5_core_dev *dev);
+void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev);
+int mlx5_pagealloc_start(struct mlx5_core_dev *dev);
+void mlx5_pagealloc_stop(struct mlx5_core_dev *dev);
+void mlx5_core_req_pages_handler(struct mlx5_core_dev *dev, u16 func_id,
+				 s16 npages);
+int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev);
+int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev);
+void mlx5_register_debugfs(void);
+void mlx5_unregister_debugfs(void);
+int mlx5_eq_init(struct mlx5_core_dev *dev);
+void mlx5_eq_cleanup(struct mlx5_core_dev *dev);
+void mlx5_fill_page_array(struct mlx5_buf *buf, __be64 *pas);
+void mlx5_cq_completion(struct mlx5_core_dev *dev, u32 cqn);
+void mlx5_qp_event(struct mlx5_core_dev *dev, u32 qpn, int event_type);
+void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type);
+struct mlx5_core_srq *mlx5_core_get_srq(struct mlx5_core_dev *dev, u32 srqn);
+void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector);
+void mlx5_cq_event(struct mlx5_core_dev *dev, u32 cqn, int event_type);
+int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx,
+		       int nent, u64 mask, const char *name, struct mlx5_uar *uar);
+int mlx5_destroy_unmap_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
+int mlx5_start_eqs(struct mlx5_core_dev *dev);
+int mlx5_stop_eqs(struct mlx5_core_dev *dev);
+int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn);
+int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn);
+
+int mlx5_qp_debugfs_init(struct mlx5_core_dev *dev);
+void mlx5_qp_debugfs_cleanup(struct mlx5_core_dev *dev);
+int mlx5_core_access_reg(struct mlx5_core_dev *dev, void *data_in,
+			 int size_in, void *data_out, int size_out,
+			 u16 reg_num, int arg, int write);
+int mlx5_set_port_caps(struct mlx5_core_dev *dev, int port_num, u32 caps);
+
+int mlx5_debug_eq_add(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
+void mlx5_debug_eq_remove(struct mlx5_core_dev *dev, struct mlx5_eq *eq);
+int mlx5_core_eq_query(struct mlx5_core_dev *dev, struct mlx5_eq *eq,
+		       struct mlx5_query_eq_mbox_out *out, int outlen);
+int mlx5_eq_debugfs_init(struct mlx5_core_dev *dev);
+void mlx5_eq_debugfs_cleanup(struct mlx5_core_dev *dev);
+int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev);
+void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev);
+int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db);
+void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db);
+
+typedef void (*health_handler_t)(struct pci_dev *pdev, void *buf, int size);
+int mlx5_register_health_report_handler(health_handler_t handler);
+void mlx5_unregister_health_report_handler(void);
+const char *mlx5_command_str(int command);
+int mlx5_cmdif_debugfs_init(struct mlx5_core_dev *dev);
+void mlx5_cmdif_debugfs_cleanup(struct mlx5_core_dev *dev);
+
+static inline u32 mlx5_mkey_to_idx(u32 mkey)
+{
+	return mkey >> 8;
+}
+
+static inline u32 mlx5_idx_to_mkey(u32 mkey_idx)
+{
+	return mkey_idx << 8;
+}
+
+enum {
+	MLX5_PROF_MASK_QP_SIZE		= (u64)1 << 0,
+	MLX5_PROF_MASK_CMDIF_CSUM	= (u64)1 << 1,
+	MLX5_PROF_MASK_MR_CACHE		= (u64)1 << 2,
+};
+
+enum {
+	MAX_MR_CACHE_ENTRIES    = 16,
+};
+
+struct mlx5_profile {
+	u64	mask;
+	u32	log_max_qp;
+	int	cmdif_csum;
+	struct {
+		int	size;
+		int	limit;
+	} mr_cache[MAX_MR_CACHE_ENTRIES];
+};
+
+#endif /* MLX5_DRIVER_H */
diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h
new file mode 100644
index 000000000000..d9e3eacb3a7f
--- /dev/null
+++ b/include/linux/mlx5/qp.h
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_QP_H
+#define MLX5_QP_H
+
+#include <linux/mlx5/device.h>
+#include <linux/mlx5/driver.h>
+
+#define MLX5_INVALID_LKEY	0x100
+
+enum mlx5_qp_optpar {
+	MLX5_QP_OPTPAR_ALT_ADDR_PATH		= 1 << 0,
+	MLX5_QP_OPTPAR_RRE			= 1 << 1,
+	MLX5_QP_OPTPAR_RAE			= 1 << 2,
+	MLX5_QP_OPTPAR_RWE			= 1 << 3,
+	MLX5_QP_OPTPAR_PKEY_INDEX		= 1 << 4,
+	MLX5_QP_OPTPAR_Q_KEY			= 1 << 5,
+	MLX5_QP_OPTPAR_RNR_TIMEOUT		= 1 << 6,
+	MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH	= 1 << 7,
+	MLX5_QP_OPTPAR_SRA_MAX			= 1 << 8,
+	MLX5_QP_OPTPAR_RRA_MAX			= 1 << 9,
+	MLX5_QP_OPTPAR_PM_STATE			= 1 << 10,
+	MLX5_QP_OPTPAR_RETRY_COUNT		= 1 << 12,
+	MLX5_QP_OPTPAR_RNR_RETRY		= 1 << 13,
+	MLX5_QP_OPTPAR_ACK_TIMEOUT		= 1 << 14,
+	MLX5_QP_OPTPAR_PRI_PORT			= 1 << 16,
+	MLX5_QP_OPTPAR_SRQN			= 1 << 18,
+	MLX5_QP_OPTPAR_CQN_RCV			= 1 << 19,
+	MLX5_QP_OPTPAR_DC_HS			= 1 << 20,
+	MLX5_QP_OPTPAR_DC_KEY			= 1 << 21,
+};
+
+enum mlx5_qp_state {
+	MLX5_QP_STATE_RST			= 0,
+	MLX5_QP_STATE_INIT			= 1,
+	MLX5_QP_STATE_RTR			= 2,
+	MLX5_QP_STATE_RTS			= 3,
+	MLX5_QP_STATE_SQER			= 4,
+	MLX5_QP_STATE_SQD			= 5,
+	MLX5_QP_STATE_ERR			= 6,
+	MLX5_QP_STATE_SQ_DRAINING		= 7,
+	MLX5_QP_STATE_SUSPENDED			= 9,
+	MLX5_QP_NUM_STATE
+};
+
+enum {
+	MLX5_QP_ST_RC				= 0x0,
+	MLX5_QP_ST_UC				= 0x1,
+	MLX5_QP_ST_UD				= 0x2,
+	MLX5_QP_ST_XRC				= 0x3,
+	MLX5_QP_ST_MLX				= 0x4,
+	MLX5_QP_ST_DCI				= 0x5,
+	MLX5_QP_ST_DCT				= 0x6,
+	MLX5_QP_ST_QP0				= 0x7,
+	MLX5_QP_ST_QP1				= 0x8,
+	MLX5_QP_ST_RAW_ETHERTYPE		= 0x9,
+	MLX5_QP_ST_RAW_IPV6			= 0xa,
+	MLX5_QP_ST_SNIFFER			= 0xb,
+	MLX5_QP_ST_SYNC_UMR			= 0xe,
+	MLX5_QP_ST_PTP_1588			= 0xd,
+	MLX5_QP_ST_REG_UMR			= 0xc,
+	MLX5_QP_ST_MAX
+};
+
+enum {
+	MLX5_QP_PM_MIGRATED			= 0x3,
+	MLX5_QP_PM_ARMED			= 0x0,
+	MLX5_QP_PM_REARM			= 0x1
+};
+
+enum {
+	MLX5_NON_ZERO_RQ	= 0 << 24,
+	MLX5_SRQ_RQ		= 1 << 24,
+	MLX5_CRQ_RQ		= 2 << 24,
+	MLX5_ZERO_LEN_RQ	= 3 << 24
+};
+
+enum {
+	/* params1 */
+	MLX5_QP_BIT_SRE				= 1 << 15,
+	MLX5_QP_BIT_SWE				= 1 << 14,
+	MLX5_QP_BIT_SAE				= 1 << 13,
+	/* params2 */
+	MLX5_QP_BIT_RRE				= 1 << 15,
+	MLX5_QP_BIT_RWE				= 1 << 14,
+	MLX5_QP_BIT_RAE				= 1 << 13,
+	MLX5_QP_BIT_RIC				= 1 <<	4,
+};
+
+enum {
+	MLX5_WQE_CTRL_CQ_UPDATE		= 2 << 2,
+	MLX5_WQE_CTRL_SOLICITED		= 1 << 1,
+};
+
+enum {
+	MLX5_SEND_WQE_BB	= 64,
+};
+
+enum {
+	MLX5_WQE_FMR_PERM_LOCAL_READ	= 1 << 27,
+	MLX5_WQE_FMR_PERM_LOCAL_WRITE	= 1 << 28,
+	MLX5_WQE_FMR_PERM_REMOTE_READ	= 1 << 29,
+	MLX5_WQE_FMR_PERM_REMOTE_WRITE	= 1 << 30,
+	MLX5_WQE_FMR_PERM_ATOMIC	= 1 << 31
+};
+
+enum {
+	MLX5_FENCE_MODE_NONE			= 0 << 5,
+	MLX5_FENCE_MODE_INITIATOR_SMALL		= 1 << 5,
+	MLX5_FENCE_MODE_STRONG_ORDERING		= 3 << 5,
+	MLX5_FENCE_MODE_SMALL_AND_FENCE		= 4 << 5,
+};
+
+enum {
+	MLX5_QP_LAT_SENSITIVE	= 1 << 28,
+	MLX5_QP_ENABLE_SIG	= 1 << 31,
+};
+
+enum {
+	MLX5_RCV_DBR	= 0,
+	MLX5_SND_DBR	= 1,
+};
+
+struct mlx5_wqe_fmr_seg {
+	__be32			flags;
+	__be32			mem_key;
+	__be64			buf_list;
+	__be64			start_addr;
+	__be64			reg_len;
+	__be32			offset;
+	__be32			page_size;
+	u32			reserved[2];
+};
+
+struct mlx5_wqe_ctrl_seg {
+	__be32			opmod_idx_opcode;
+	__be32			qpn_ds;
+	u8			signature;
+	u8			rsvd[2];
+	u8			fm_ce_se;
+	__be32			imm;
+};
+
+struct mlx5_wqe_xrc_seg {
+	__be32			xrc_srqn;
+	u8			rsvd[12];
+};
+
+struct mlx5_wqe_masked_atomic_seg {
+	__be64			swap_add;
+	__be64			compare;
+	__be64			swap_add_mask;
+	__be64			compare_mask;
+};
+
+struct mlx5_av {
+	union {
+		struct {
+			__be32	qkey;
+			__be32	reserved;
+		} qkey;
+		__be64	dc_key;
+	} key;
+	__be32	dqp_dct;
+	u8	stat_rate_sl;
+	u8	fl_mlid;
+	__be16	rlid;
+	u8	reserved0[10];
+	u8	tclass;
+	u8	hop_limit;
+	__be32	grh_gid_fl;
+	u8	rgid[16];
+};
+
+struct mlx5_wqe_datagram_seg {
+	struct mlx5_av	av;
+};
+
+struct mlx5_wqe_raddr_seg {
+	__be64			raddr;
+	__be32			rkey;
+	u32			reserved;
+};
+
+struct mlx5_wqe_atomic_seg {
+	__be64			swap_add;
+	__be64			compare;
+};
+
+struct mlx5_wqe_data_seg {
+	__be32			byte_count;
+	__be32			lkey;
+	__be64			addr;
+};
+
+struct mlx5_wqe_umr_ctrl_seg {
+	u8		flags;
+	u8		rsvd0[3];
+	__be16		klm_octowords;
+	__be16		bsf_octowords;
+	__be64		mkey_mask;
+	u8		rsvd1[32];
+};
+
+struct mlx5_seg_set_psv {
+	__be32		psv_num;
+	__be16		syndrome;
+	__be16		status;
+	__be32		transient_sig;
+	__be32		ref_tag;
+};
+
+struct mlx5_seg_get_psv {
+	u8		rsvd[19];
+	u8		num_psv;
+	__be32		l_key;
+	__be64		va;
+	__be32		psv_index[4];
+};
+
+struct mlx5_seg_check_psv {
+	u8		rsvd0[2];
+	__be16		err_coalescing_op;
+	u8		rsvd1[2];
+	__be16		xport_err_op;
+	u8		rsvd2[2];
+	__be16		xport_err_mask;
+	u8		rsvd3[7];
+	u8		num_psv;
+	__be32		l_key;
+	__be64		va;
+	__be32		psv_index[4];
+};
+
+struct mlx5_rwqe_sig {
+	u8	rsvd0[4];
+	u8	signature;
+	u8	rsvd1[11];
+};
+
+struct mlx5_wqe_signature_seg {
+	u8	rsvd0[4];
+	u8	signature;
+	u8	rsvd1[11];
+};
+
+struct mlx5_wqe_inline_seg {
+	__be32	byte_count;
+};
+
+struct mlx5_core_qp {
+	void (*event)		(struct mlx5_core_qp *, int);
+	int			qpn;
+	atomic_t		refcount;
+	struct completion	free;
+	struct mlx5_rsc_debug	*dbg;
+	int			pid;
+};
+
+struct mlx5_qp_path {
+	u8			fl;
+	u8			rsvd3;
+	u8			free_ar;
+	u8			pkey_index;
+	u8			rsvd0;
+	u8			grh_mlid;
+	__be16			rlid;
+	u8			ackto_lt;
+	u8			mgid_index;
+	u8			static_rate;
+	u8			hop_limit;
+	__be32			tclass_flowlabel;
+	u8			rgid[16];
+	u8			rsvd1[4];
+	u8			sl;
+	u8			port;
+	u8			rsvd2[6];
+};
+
+struct mlx5_qp_context {
+	__be32			flags;
+	__be32			flags_pd;
+	u8			mtu_msgmax;
+	u8			rq_size_stride;
+	__be16			sq_crq_size;
+	__be32			qp_counter_set_usr_page;
+	__be32			wire_qpn;
+	__be32			log_pg_sz_remote_qpn;
+	struct			mlx5_qp_path pri_path;
+	struct			mlx5_qp_path alt_path;
+	__be32			params1;
+	u8			reserved2[4];
+	__be32			next_send_psn;
+	__be32			cqn_send;
+	u8			reserved3[8];
+	__be32			last_acked_psn;
+	__be32			ssn;
+	__be32			params2;
+	__be32			rnr_nextrecvpsn;
+	__be32			xrcd;
+	__be32			cqn_recv;
+	__be64			db_rec_addr;
+	__be32			qkey;
+	__be32			rq_type_srqn;
+	__be32			rmsn;
+	__be16			hw_sq_wqe_counter;
+	__be16			sw_sq_wqe_counter;
+	__be16			hw_rcyclic_byte_counter;
+	__be16			hw_rq_counter;
+	__be16			sw_rcyclic_byte_counter;
+	__be16			sw_rq_counter;
+	u8			rsvd0[5];
+	u8			cgs;
+	u8			cs_req;
+	u8			cs_res;
+	__be64			dc_access_key;
+	u8			rsvd1[24];
+};
+
+struct mlx5_create_qp_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			input_qpn;
+	u8			rsvd0[4];
+	__be32			opt_param_mask;
+	u8			rsvd1[4];
+	struct mlx5_qp_context	ctx;
+	u8			rsvd3[16];
+	__be64			pas[0];
+};
+
+struct mlx5_create_qp_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			qpn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_destroy_qp_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	u8			rsvd0[4];
+};
+
+struct mlx5_destroy_qp_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+};
+
+struct mlx5_modify_qp_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	u8			rsvd1[4];
+	__be32			optparam;
+	u8			rsvd0[4];
+	struct mlx5_qp_context	ctx;
+};
+
+struct mlx5_modify_qp_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd0[8];
+};
+
+struct mlx5_query_qp_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	u8			rsvd[4];
+};
+
+struct mlx5_query_qp_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd1[8];
+	__be32			optparam;
+	u8			rsvd0[4];
+	struct mlx5_qp_context	ctx;
+	u8			rsvd2[16];
+	__be64			pas[0];
+};
+
+struct mlx5_conf_sqp_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			qpn;
+	u8			rsvd[3];
+	u8			type;
+};
+
+struct mlx5_conf_sqp_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_alloc_xrcd_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+struct mlx5_alloc_xrcd_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	__be32			xrcdn;
+	u8			rsvd[4];
+};
+
+struct mlx5_dealloc_xrcd_mbox_in {
+	struct mlx5_inbox_hdr	hdr;
+	__be32			xrcdn;
+	u8			rsvd[4];
+};
+
+struct mlx5_dealloc_xrcd_mbox_out {
+	struct mlx5_outbox_hdr	hdr;
+	u8			rsvd[8];
+};
+
+static inline struct mlx5_core_qp *__mlx5_qp_lookup(struct mlx5_core_dev *dev, u32 qpn)
+{
+	return radix_tree_lookup(&dev->priv.qp_table.tree, qpn);
+}
+
+int mlx5_core_create_qp(struct mlx5_core_dev *dev,
+			struct mlx5_core_qp *qp,
+			struct mlx5_create_qp_mbox_in *in,
+			int inlen);
+int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state,
+			enum mlx5_qp_state new_state,
+			struct mlx5_modify_qp_mbox_in *in, int sqd_event,
+			struct mlx5_core_qp *qp);
+int mlx5_core_destroy_qp(struct mlx5_core_dev *dev,
+			 struct mlx5_core_qp *qp);
+int mlx5_core_qp_query(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp,
+		       struct mlx5_query_qp_mbox_out *out, int outlen);
+
+int mlx5_core_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn);
+int mlx5_core_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn);
+void mlx5_init_qp_table(struct mlx5_core_dev *dev);
+void mlx5_cleanup_qp_table(struct mlx5_core_dev *dev);
+int mlx5_debug_qp_add(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
+void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp);
+
+#endif /* MLX5_QP_H */
diff --git a/include/linux/mlx5/srq.h b/include/linux/mlx5/srq.h
new file mode 100644
index 000000000000..e1a363a33663
--- /dev/null
+++ b/include/linux/mlx5/srq.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2013, Mellanox Technologies inc.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_SRQ_H
+#define MLX5_SRQ_H
+
+#include <linux/mlx5/driver.h>
+
+void mlx5_init_srq_table(struct mlx5_core_dev *dev);
+void mlx5_cleanup_srq_table(struct mlx5_core_dev *dev);
+
+#endif /* MLX5_SRQ_H */
-- 
cgit v1.2.3


From 63884c90ffa3f73a81b81f169c51c34d2b9cf75e Mon Sep 17 00:00:00 2001
From: Roland Dreier <roland@purestorage.com>
Date: Mon, 1 Jul 2013 14:15:17 -0700
Subject: mlx5: Fix parameter type of health_handler_t

This deals with the sparse warning:

    drivers/net/ethernet/mellanox/mlx5/core/health.c:94:54: warning: incorrect type in argument 2 (different address spaces)
    drivers/net/ethernet/mellanox/mlx5/core/health.c:94:54:    expected void *buf
    drivers/net/ethernet/mellanox/mlx5/core/health.c:94:54:    got struct health_buffer [noderef] <asn:2>*health

Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 include/linux/mlx5/driver.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index e47f1e4c9b03..f22e4419839b 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -729,7 +729,7 @@ void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev);
 int mlx5_db_alloc(struct mlx5_core_dev *dev, struct mlx5_db *db);
 void mlx5_db_free(struct mlx5_core_dev *dev, struct mlx5_db *db);
 
-typedef void (*health_handler_t)(struct pci_dev *pdev, void *buf, int size);
+typedef void (*health_handler_t)(struct pci_dev *pdev, struct health_buffer __iomem *buf, int size);
 int mlx5_register_health_report_handler(health_handler_t handler);
 void mlx5_unregister_health_report_handler(void);
 const char *mlx5_command_str(int command);
-- 
cgit v1.2.3


From 8b80cda536ea9bceec0364e897868a30ee13b992 Mon Sep 17 00:00:00 2001
From: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Date: Wed, 10 Jul 2013 17:13:26 +0300
Subject: net: rename ll methods to busy-poll

Rename ndo_ll_poll to ndo_busy_poll.
Rename sk_mark_ll to sk_mark_napi_id.
Rename skb_mark_ll to skb_mark_napi_id.
Correct all useres of these functions.
Update comments and defines  in include/net/busy_poll.h

Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c  |  2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c |  2 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c    |  4 ++--
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c   |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c       |  2 +-
 include/linux/netdevice.h                        |  2 +-
 include/net/busy_poll.h                          | 22 ++++++++++++----------
 net/ipv4/tcp_ipv4.c                              |  2 +-
 net/ipv4/udp.c                                   |  2 +-
 net/ipv6/tcp_ipv6.c                              |  2 +-
 net/ipv6/udp.c                                   |  2 +-
 11 files changed, 23 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 05b6b4e8b073..3353efe79194 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -990,7 +990,7 @@ reuse_rx:
 			__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
 					       le16_to_cpu(cqe_fp->vlan_tag));
 
-		skb_mark_ll(skb, &fp->napi);
+		skb_mark_napi_id(skb, &fp->napi);
 
 		if (bnx2x_fp_ll_polling(fp))
 			netif_receive_skb(skb);
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
index 15a528bda87c..e5da07858a2f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
@@ -12027,7 +12027,7 @@ static const struct net_device_ops bnx2x_netdev_ops = {
 #endif
 
 #ifdef CONFIG_NET_LL_RX_POLL
-	.ndo_ll_poll		= bnx2x_low_latency_recv,
+	.ndo_busy_poll		= bnx2x_low_latency_recv,
 #endif
 };
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 047ebaaf0141..bad8f14b1941 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1978,7 +1978,7 @@ static int ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		}
 
 #endif /* IXGBE_FCOE */
-		skb_mark_ll(skb, &q_vector->napi);
+		skb_mark_napi_id(skb, &q_vector->napi);
 		ixgbe_rx_skb(q_vector, skb);
 
 		/* update budget accounting */
@@ -7228,7 +7228,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_poll_controller	= ixgbe_netpoll,
 #endif
 #ifdef CONFIG_NET_LL_RX_POLL
-	.ndo_ll_poll		= ixgbe_low_latency_recv,
+	.ndo_busy_poll		= ixgbe_low_latency_recv,
 #endif
 #ifdef IXGBE_FCOE
 	.ndo_fcoe_ddp_setup = ixgbe_fcoe_ddp_get,
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index 0fb2438dc2c7..5eac871399d8 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2141,7 +2141,7 @@ static const struct net_device_ops mlx4_netdev_ops = {
 	.ndo_rx_flow_steer	= mlx4_en_filter_rfs,
 #endif
 #ifdef CONFIG_NET_LL_RX_POLL
-	.ndo_ll_poll		= mlx4_en_low_latency_recv,
+	.ndo_busy_poll		= mlx4_en_low_latency_recv,
 #endif
 };
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 90746d37ac9b..dec455c8f627 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -767,7 +767,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 					       timestamp);
 		}
 
-		skb_mark_ll(skb, &cq->napi);
+		skb_mark_napi_id(skb, &cq->napi);
 
 		/* Push it up the stack */
 		netif_receive_skb(skb);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bb82871b8494..0741a1e919a5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -974,7 +974,7 @@ struct net_device_ops {
 	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
 #endif
 #ifdef CONFIG_NET_LL_RX_POLL
-	int			(*ndo_ll_poll)(struct napi_struct *dev);
+	int			(*ndo_busy_poll)(struct napi_struct *dev);
 #endif
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
 						  int queue, u8 *mac);
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
index 76f034087743..4ff71908fd42 100644
--- a/include/net/busy_poll.h
+++ b/include/net/busy_poll.h
@@ -1,5 +1,5 @@
 /*
- * Low Latency Sockets
+ * net busy poll support
  * Copyright(c) 2013 Intel Corporation.
  *
  * This program is free software; you can redistribute it and/or modify it
@@ -21,8 +21,8 @@
  * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
  */
 
-#ifndef _LINUX_NET_LL_POLL_H
-#define _LINUX_NET_LL_POLL_H
+#ifndef _LINUX_NET_BUSY_POLL_H
+#define _LINUX_NET_BUSY_POLL_H
 
 #include <linux/netdevice.h>
 #include <net/ip.h>
@@ -110,11 +110,11 @@ static inline bool sk_busy_loop(struct sock *sk, int nonblock)
 		goto out;
 
 	ops = napi->dev->netdev_ops;
-	if (!ops->ndo_ll_poll)
+	if (!ops->ndo_busy_poll)
 		goto out;
 
 	do {
-		rc = ops->ndo_ll_poll(napi);
+		rc = ops->ndo_busy_poll(napi);
 
 		if (rc == LL_FLUSH_FAILED)
 			break; /* permanent failure */
@@ -134,13 +134,14 @@ out:
 }
 
 /* used in the NIC receive handler to mark the skb */
-static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
+static inline void skb_mark_napi_id(struct sk_buff *skb,
+				    struct napi_struct *napi)
 {
 	skb->napi_id = napi->napi_id;
 }
 
 /* used in the protocol hanlder to propagate the napi_id to the socket */
-static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
+static inline void sk_mark_napi_id(struct sock *sk, struct sk_buff *skb)
 {
 	sk->sk_napi_id = skb->napi_id;
 }
@@ -166,11 +167,12 @@ static inline bool sk_busy_poll(struct sock *sk, int nonblock)
 	return false;
 }
 
-static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
+static inline void skb_mark_napi_id(struct sk_buff *skb,
+				    struct napi_struct *napi)
 {
 }
 
-static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
+static inline void sk_mark_napi_id(struct sock *sk, struct sk_buff *skb)
 {
 }
 
@@ -180,4 +182,4 @@ static inline bool busy_loop_timeout(unsigned long end_time)
 }
 
 #endif /* CONFIG_NET_LL_RX_POLL */
-#endif /* _LINUX_NET_LL_POLL_H */
+#endif /* _LINUX_NET_BUSY_POLL_H */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3a261b41a00c..b299da5ff499 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1994,7 +1994,7 @@ process:
 	if (sk_filter(sk, skb))
 		goto discard_and_relse;
 
-	sk_mark_ll(sk, skb);
+	sk_mark_napi_id(sk, skb);
 	skb->dev = NULL;
 
 	bh_lock_sock_nested(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index bcc0ff2c16da..a0d7151ffbd9 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1713,7 +1713,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	if (sk != NULL) {
 		int ret;
 
-		sk_mark_ll(sk, skb);
+		sk_mark_napi_id(sk, skb);
 		ret = udp_queue_rcv_skb(sk, skb);
 		sock_put(sk);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 345bd92d4ddb..6e1649d58533 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1499,7 +1499,7 @@ process:
 	if (sk_filter(sk, skb))
 		goto discard_and_relse;
 
-	sk_mark_ll(sk, skb);
+	sk_mark_napi_id(sk, skb);
 	skb->dev = NULL;
 
 	bh_lock_sock_nested(sk);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 40e72034da07..f4058150262b 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -844,7 +844,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
 	if (sk != NULL) {
 		int ret;
 
-		sk_mark_ll(sk, skb);
+		sk_mark_napi_id(sk, skb);
 		ret = udpv6_queue_rcv_skb(sk, skb);
 		sock_put(sk);
 
-- 
cgit v1.2.3


From 288dde9f23b6726c1e8147bf635721372bf77b16 Mon Sep 17 00:00:00 2001
From: Moshe Lazer <moshel@mellanox.com>
Date: Wed, 10 Jul 2013 14:31:03 +0300
Subject: mlx5_core: Adjust hca_cap.uar_page_sz to conform to Connect-IB spec

Sparse reported an endianness bug in the assignment to hca_cap.uar_page_sz.

Fix the declaration of this field to be __be16 (which is what is in
the firmware spec), renaming the field to log_uar_pg_size to conform
to the spec, which fixes the endianness bug reported by sparse.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Moshe Lazer <moshel@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +-
 include/linux/mlx5/device.h                    | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index f21cc397d1bc..12242de2b0e3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -212,7 +212,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev)
 		set_ctx->hca_cap.log_max_qp = dev->profile->log_max_qp;
 
 	memset(&set_out, 0, sizeof(set_out));
-	set_ctx->hca_cap.uar_page_sz = cpu_to_be16(PAGE_SHIFT - 12);
+	set_ctx->hca_cap.log_uar_page_sz = cpu_to_be16(PAGE_SHIFT - 12);
 	set_ctx->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_SET_HCA_CAP);
 	err = mlx5_cmd_exec(dev, set_ctx, sizeof(*set_ctx),
 				 &set_out, sizeof(set_out));
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 51390915e538..8de8d8f22384 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -317,8 +317,8 @@ struct mlx5_hca_cap {
 	u8	log_max_pd;
 	u8	rsvd25;
 	u8	log_max_xrcd;
-	u8	rsvd26[40];
-	__be32  uar_page_sz;
+	u8	rsvd26[42];
+	__be16  log_uar_page_sz;
 	u8	rsvd27[28];
 	u8	log_msx_atomic_size_qp;
 	u8	rsvd28[2];
-- 
cgit v1.2.3


From 1b375dc30710180c4b88cc59caba6e3481ec5c8b Mon Sep 17 00:00:00 2001
From: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Date: Fri, 5 Jul 2013 09:29:32 +0200
Subject: mutex: Move ww_mutex definitions to ww_mutex.h

Move the definitions for wound/wait mutexes out to a separate
header, ww_mutex.h. This reduces clutter in mutex.h, and
increases readability.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Maarten Lankhorst <maarten.lankhorst@canonical.com>
Cc: Dave Airlie <airlied@gmail.com>
Link: http://lkml.kernel.org/r/51D675DC.3000907@canonical.com
[ Tidied up the code a bit. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/mutex.h       | 358 -----------------------------------------
 include/linux/reservation.h |   2 +-
 include/linux/ww_mutex.h    | 378 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/mutex.c              |   1 +
 lib/locking-selftest.c      |   1 +
 5 files changed, 381 insertions(+), 359 deletions(-)
 create mode 100644 include/linux/ww_mutex.h

(limited to 'include/linux')

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3793ed7feeeb..ccd4260834c5 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -78,40 +78,6 @@ struct mutex_waiter {
 #endif
 };
 
-struct ww_class {
-	atomic_long_t stamp;
-	struct lock_class_key acquire_key;
-	struct lock_class_key mutex_key;
-	const char *acquire_name;
-	const char *mutex_name;
-};
-
-struct ww_acquire_ctx {
-	struct task_struct *task;
-	unsigned long stamp;
-	unsigned acquired;
-#ifdef CONFIG_DEBUG_MUTEXES
-	unsigned done_acquire;
-	struct ww_class *ww_class;
-	struct ww_mutex *contending_lock;
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map dep_map;
-#endif
-#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
-	unsigned deadlock_inject_interval;
-	unsigned deadlock_inject_countdown;
-#endif
-};
-
-struct ww_mutex {
-	struct mutex base;
-	struct ww_acquire_ctx *ctx;
-#ifdef CONFIG_DEBUG_MUTEXES
-	struct ww_class *ww_class;
-#endif
-};
-
 #ifdef CONFIG_DEBUG_MUTEXES
 # include <linux/mutex-debug.h>
 #else
@@ -136,11 +102,8 @@ static inline void mutex_destroy(struct mutex *lock) {}
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __DEP_MAP_MUTEX_INITIALIZER(lockname) \
 		, .dep_map = { .name = #lockname }
-# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class) \
-		, .ww_class = &ww_class
 #else
 # define __DEP_MAP_MUTEX_INITIALIZER(lockname)
-# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class)
 #endif
 
 #define __MUTEX_INITIALIZER(lockname) \
@@ -150,48 +113,12 @@ static inline void mutex_destroy(struct mutex *lock) {}
 		__DEBUG_MUTEX_INITIALIZER(lockname) \
 		__DEP_MAP_MUTEX_INITIALIZER(lockname) }
 
-#define __WW_CLASS_INITIALIZER(ww_class) \
-		{ .stamp = ATOMIC_LONG_INIT(0) \
-		, .acquire_name = #ww_class "_acquire" \
-		, .mutex_name = #ww_class "_mutex" }
-
-#define __WW_MUTEX_INITIALIZER(lockname, class) \
-		{ .base = { \__MUTEX_INITIALIZER(lockname) } \
-		__WW_CLASS_MUTEX_INITIALIZER(lockname, class) }
-
 #define DEFINE_MUTEX(mutexname) \
 	struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)
 
-#define DEFINE_WW_CLASS(classname) \
-	struct ww_class classname = __WW_CLASS_INITIALIZER(classname)
-
-#define DEFINE_WW_MUTEX(mutexname, ww_class) \
-	struct ww_mutex mutexname = __WW_MUTEX_INITIALIZER(mutexname, ww_class)
-
-
 extern void __mutex_init(struct mutex *lock, const char *name,
 			 struct lock_class_key *key);
 
-/**
- * ww_mutex_init - initialize the w/w mutex
- * @lock: the mutex to be initialized
- * @ww_class: the w/w class the mutex should belong to
- *
- * Initialize the w/w mutex to unlocked state and associate it with the given
- * class.
- *
- * It is not allowed to initialize an already locked mutex.
- */
-static inline void ww_mutex_init(struct ww_mutex *lock,
-				 struct ww_class *ww_class)
-{
-	__mutex_init(&lock->base, ww_class->mutex_name, &ww_class->mutex_key);
-	lock->ctx = NULL;
-#ifdef CONFIG_DEBUG_MUTEXES
-	lock->ww_class = ww_class;
-#endif
-}
-
 /**
  * mutex_is_locked - is the mutex locked
  * @lock: the mutex to be queried
@@ -246,291 +173,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
 
-/**
- * ww_acquire_init - initialize a w/w acquire context
- * @ctx: w/w acquire context to initialize
- * @ww_class: w/w class of the context
- *
- * Initializes an context to acquire multiple mutexes of the given w/w class.
- *
- * Context-based w/w mutex acquiring can be done in any order whatsoever within
- * a given lock class. Deadlocks will be detected and handled with the
- * wait/wound logic.
- *
- * Mixing of context-based w/w mutex acquiring and single w/w mutex locking can
- * result in undetected deadlocks and is so forbidden. Mixing different contexts
- * for the same w/w class when acquiring mutexes can also result in undetected
- * deadlocks, and is hence also forbidden. Both types of abuse will be caught by
- * enabling CONFIG_PROVE_LOCKING.
- *
- * Nesting of acquire contexts for _different_ w/w classes is possible, subject
- * to the usual locking rules between different lock classes.
- *
- * An acquire context must be released with ww_acquire_fini by the same task
- * before the memory is freed. It is recommended to allocate the context itself
- * on the stack.
- */
-static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
-				   struct ww_class *ww_class)
-{
-	ctx->task = current;
-	ctx->stamp = atomic_long_inc_return(&ww_class->stamp);
-	ctx->acquired = 0;
-#ifdef CONFIG_DEBUG_MUTEXES
-	ctx->ww_class = ww_class;
-	ctx->done_acquire = 0;
-	ctx->contending_lock = NULL;
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	debug_check_no_locks_freed((void *)ctx, sizeof(*ctx));
-	lockdep_init_map(&ctx->dep_map, ww_class->acquire_name,
-			 &ww_class->acquire_key, 0);
-	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);
-#endif
-#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
-	ctx->deadlock_inject_interval = 1;
-	ctx->deadlock_inject_countdown = ctx->stamp & 0xf;
-#endif
-}
-
-/**
- * ww_acquire_done - marks the end of the acquire phase
- * @ctx: the acquire context
- *
- * Marks the end of the acquire phase, any further w/w mutex lock calls using
- * this context are forbidden.
- *
- * Calling this function is optional, it is just useful to document w/w mutex
- * code and clearly designated the acquire phase from actually using the locked
- * data structures.
- */
-static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
-	lockdep_assert_held(ctx);
-
-	DEBUG_LOCKS_WARN_ON(ctx->done_acquire);
-	ctx->done_acquire = 1;
-#endif
-}
-
-/**
- * ww_acquire_fini - releases a w/w acquire context
- * @ctx: the acquire context to free
- *
- * Releases a w/w acquire context. This must be called _after_ all acquired w/w
- * mutexes have been released with ww_mutex_unlock.
- */
-static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
-	mutex_release(&ctx->dep_map, 0, _THIS_IP_);
-
-	DEBUG_LOCKS_WARN_ON(ctx->acquired);
-	if (!config_enabled(CONFIG_PROVE_LOCKING))
-		/*
-		 * lockdep will normally handle this,
-		 * but fail without anyway
-		 */
-		ctx->done_acquire = 1;
-
-	if (!config_enabled(CONFIG_DEBUG_LOCK_ALLOC))
-		/* ensure ww_acquire_fini will still fail if called twice */
-		ctx->acquired = ~0U;
-#endif
-}
-
-extern int __must_check __ww_mutex_lock(struct ww_mutex *lock,
-					struct ww_acquire_ctx *ctx);
-extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock,
-						      struct ww_acquire_ctx *ctx);
-
-/**
- * ww_mutex_lock - acquire the w/w mutex
- * @lock: the mutex to be acquired
- * @ctx: w/w acquire context, or NULL to acquire only a single lock.
- *
- * Lock the w/w mutex exclusively for this task.
- *
- * Deadlocks within a given w/w class of locks are detected and handled with the
- * wait/wound algorithm. If the lock isn't immediately avaiable this function
- * will either sleep until it is (wait case). Or it selects the current context
- * for backing off by returning -EDEADLK (wound case). Trying to acquire the
- * same lock with the same context twice is also detected and signalled by
- * returning -EALREADY. Returns 0 if the mutex was successfully acquired.
- *
- * In the wound case the caller must release all currently held w/w mutexes for
- * the given context and then wait for this contending lock to be available by
- * calling ww_mutex_lock_slow. Alternatively callers can opt to not acquire this
- * lock and proceed with trying to acquire further w/w mutexes (e.g. when
- * scanning through lru lists trying to free resources).
- *
- * The mutex must later on be released by the same task that
- * acquired it. The task may not exit without first unlocking the mutex. Also,
- * kernel memory where the mutex resides must not be freed with the mutex still
- * locked. The mutex must first be initialized (or statically defined) before it
- * can be locked. memset()-ing the mutex to 0 is not allowed. The mutex must be
- * of the same w/w lock class as was used to initialize the acquire context.
- *
- * A mutex acquired with this function must be released with ww_mutex_unlock.
- */
-static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
-{
-	if (ctx)
-		return __ww_mutex_lock(lock, ctx);
-	else {
-		mutex_lock(&lock->base);
-		return 0;
-	}
-}
-
-/**
- * ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible
- * @lock: the mutex to be acquired
- * @ctx: w/w acquire context
- *
- * Lock the w/w mutex exclusively for this task.
- *
- * Deadlocks within a given w/w class of locks are detected and handled with the
- * wait/wound algorithm. If the lock isn't immediately avaiable this function
- * will either sleep until it is (wait case). Or it selects the current context
- * for backing off by returning -EDEADLK (wound case). Trying to acquire the
- * same lock with the same context twice is also detected and signalled by
- * returning -EALREADY. Returns 0 if the mutex was successfully acquired. If a
- * signal arrives while waiting for the lock then this function returns -EINTR.
- *
- * In the wound case the caller must release all currently held w/w mutexes for
- * the given context and then wait for this contending lock to be available by
- * calling ww_mutex_lock_slow_interruptible. Alternatively callers can opt to
- * not acquire this lock and proceed with trying to acquire further w/w mutexes
- * (e.g. when scanning through lru lists trying to free resources).
- *
- * The mutex must later on be released by the same task that
- * acquired it. The task may not exit without first unlocking the mutex. Also,
- * kernel memory where the mutex resides must not be freed with the mutex still
- * locked. The mutex must first be initialized (or statically defined) before it
- * can be locked. memset()-ing the mutex to 0 is not allowed. The mutex must be
- * of the same w/w lock class as was used to initialize the acquire context.
- *
- * A mutex acquired with this function must be released with ww_mutex_unlock.
- */
-static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
-							   struct ww_acquire_ctx *ctx)
-{
-	if (ctx)
-		return __ww_mutex_lock_interruptible(lock, ctx);
-	else
-		return mutex_lock_interruptible(&lock->base);
-}
-
-/**
- * ww_mutex_lock_slow - slowpath acquiring of the w/w mutex
- * @lock: the mutex to be acquired
- * @ctx: w/w acquire context
- *
- * Acquires a w/w mutex with the given context after a wound case. This function
- * will sleep until the lock becomes available.
- *
- * The caller must have released all w/w mutexes already acquired with the
- * context and then call this function on the contended lock.
- *
- * Afterwards the caller may continue to (re)acquire the other w/w mutexes it
- * needs with ww_mutex_lock. Note that the -EALREADY return code from
- * ww_mutex_lock can be used to avoid locking this contended mutex twice.
- *
- * It is forbidden to call this function with any other w/w mutexes associated
- * with the context held. It is forbidden to call this on anything else than the
- * contending mutex.
- *
- * Note that the slowpath lock acquiring can also be done by calling
- * ww_mutex_lock directly. This function here is simply to help w/w mutex
- * locking code readability by clearly denoting the slowpath.
- */
-static inline void
-ww_mutex_lock_slow(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
-{
-	int ret;
-#ifdef CONFIG_DEBUG_MUTEXES
-	DEBUG_LOCKS_WARN_ON(!ctx->contending_lock);
-#endif
-	ret = ww_mutex_lock(lock, ctx);
-	(void)ret;
-}
-
-/**
- * ww_mutex_lock_slow_interruptible - slowpath acquiring of the w/w mutex,
- * 				      interruptible
- * @lock: the mutex to be acquired
- * @ctx: w/w acquire context
- *
- * Acquires a w/w mutex with the given context after a wound case. This function
- * will sleep until the lock becomes available and returns 0 when the lock has
- * been acquired. If a signal arrives while waiting for the lock then this
- * function returns -EINTR.
- *
- * The caller must have released all w/w mutexes already acquired with the
- * context and then call this function on the contended lock.
- *
- * Afterwards the caller may continue to (re)acquire the other w/w mutexes it
- * needs with ww_mutex_lock. Note that the -EALREADY return code from
- * ww_mutex_lock can be used to avoid locking this contended mutex twice.
- *
- * It is forbidden to call this function with any other w/w mutexes associated
- * with the given context held. It is forbidden to call this on anything else
- * than the contending mutex.
- *
- * Note that the slowpath lock acquiring can also be done by calling
- * ww_mutex_lock_interruptible directly. This function here is simply to help
- * w/w mutex locking code readability by clearly denoting the slowpath.
- */
-static inline int __must_check
-ww_mutex_lock_slow_interruptible(struct ww_mutex *lock,
-				 struct ww_acquire_ctx *ctx)
-{
-#ifdef CONFIG_DEBUG_MUTEXES
-	DEBUG_LOCKS_WARN_ON(!ctx->contending_lock);
-#endif
-	return ww_mutex_lock_interruptible(lock, ctx);
-}
-
-extern void ww_mutex_unlock(struct ww_mutex *lock);
-
-/**
- * ww_mutex_trylock - tries to acquire the w/w mutex without acquire context
- * @lock: mutex to lock
- *
- * Trylocks a mutex without acquire context, so no deadlock detection is
- * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
- */
-static inline int __must_check ww_mutex_trylock(struct ww_mutex *lock)
-{
-	return mutex_trylock(&lock->base);
-}
-
-/***
- * ww_mutex_destroy - mark a w/w mutex unusable
- * @lock: the mutex to be destroyed
- *
- * This function marks the mutex uninitialized, and any subsequent
- * use of the mutex is forbidden. The mutex must not be locked when
- * this function is called.
- */
-static inline void ww_mutex_destroy(struct ww_mutex *lock)
-{
-	mutex_destroy(&lock->base);
-}
-
-/**
- * ww_mutex_is_locked - is the w/w mutex locked
- * @lock: the mutex to be queried
- *
- * Returns 1 if the mutex is locked, 0 if unlocked.
- */
-static inline bool ww_mutex_is_locked(struct ww_mutex *lock)
-{
-	return mutex_is_locked(&lock->base);
-}
-
 extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
 #ifndef CONFIG_HAVE_ARCH_MUTEX_CPU_RELAX
diff --git a/include/linux/reservation.h b/include/linux/reservation.h
index e9ee806a9d72..813dae960ebd 100644
--- a/include/linux/reservation.h
+++ b/include/linux/reservation.h
@@ -39,7 +39,7 @@
 #ifndef _LINUX_RESERVATION_H
 #define _LINUX_RESERVATION_H
 
-#include <linux/mutex.h>
+#include <linux/ww_mutex.h>
 
 extern struct ww_class reservation_ww_class;
 
diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
new file mode 100644
index 000000000000..760399a470bd
--- /dev/null
+++ b/include/linux/ww_mutex.h
@@ -0,0 +1,378 @@
+/*
+ * Wound/Wait Mutexes: blocking mutual exclusion locks with deadlock avoidance
+ *
+ * Original mutex implementation started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Wound/wait implementation:
+ *  Copyright (C) 2013 Canonical Ltd.
+ *
+ * This file contains the main data structure and API definitions.
+ */
+
+#ifndef __LINUX_WW_MUTEX_H
+#define __LINUX_WW_MUTEX_H
+
+#include <linux/mutex.h>
+
+struct ww_class {
+	atomic_long_t stamp;
+	struct lock_class_key acquire_key;
+	struct lock_class_key mutex_key;
+	const char *acquire_name;
+	const char *mutex_name;
+};
+
+struct ww_acquire_ctx {
+	struct task_struct *task;
+	unsigned long stamp;
+	unsigned acquired;
+#ifdef CONFIG_DEBUG_MUTEXES
+	unsigned done_acquire;
+	struct ww_class *ww_class;
+	struct ww_mutex *contending_lock;
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map dep_map;
+#endif
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+	unsigned deadlock_inject_interval;
+	unsigned deadlock_inject_countdown;
+#endif
+};
+
+struct ww_mutex {
+	struct mutex base;
+	struct ww_acquire_ctx *ctx;
+#ifdef CONFIG_DEBUG_MUTEXES
+	struct ww_class *ww_class;
+#endif
+};
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class) \
+		, .ww_class = &ww_class
+#else
+# define __WW_CLASS_MUTEX_INITIALIZER(lockname, ww_class)
+#endif
+
+#define __WW_CLASS_INITIALIZER(ww_class) \
+		{ .stamp = ATOMIC_LONG_INIT(0) \
+		, .acquire_name = #ww_class "_acquire" \
+		, .mutex_name = #ww_class "_mutex" }
+
+#define __WW_MUTEX_INITIALIZER(lockname, class) \
+		{ .base = { \__MUTEX_INITIALIZER(lockname) } \
+		__WW_CLASS_MUTEX_INITIALIZER(lockname, class) }
+
+#define DEFINE_WW_CLASS(classname) \
+	struct ww_class classname = __WW_CLASS_INITIALIZER(classname)
+
+#define DEFINE_WW_MUTEX(mutexname, ww_class) \
+	struct ww_mutex mutexname = __WW_MUTEX_INITIALIZER(mutexname, ww_class)
+
+/**
+ * ww_mutex_init - initialize the w/w mutex
+ * @lock: the mutex to be initialized
+ * @ww_class: the w/w class the mutex should belong to
+ *
+ * Initialize the w/w mutex to unlocked state and associate it with the given
+ * class.
+ *
+ * It is not allowed to initialize an already locked mutex.
+ */
+static inline void ww_mutex_init(struct ww_mutex *lock,
+				 struct ww_class *ww_class)
+{
+	__mutex_init(&lock->base, ww_class->mutex_name, &ww_class->mutex_key);
+	lock->ctx = NULL;
+#ifdef CONFIG_DEBUG_MUTEXES
+	lock->ww_class = ww_class;
+#endif
+}
+
+/**
+ * ww_acquire_init - initialize a w/w acquire context
+ * @ctx: w/w acquire context to initialize
+ * @ww_class: w/w class of the context
+ *
+ * Initializes an context to acquire multiple mutexes of the given w/w class.
+ *
+ * Context-based w/w mutex acquiring can be done in any order whatsoever within
+ * a given lock class. Deadlocks will be detected and handled with the
+ * wait/wound logic.
+ *
+ * Mixing of context-based w/w mutex acquiring and single w/w mutex locking can
+ * result in undetected deadlocks and is so forbidden. Mixing different contexts
+ * for the same w/w class when acquiring mutexes can also result in undetected
+ * deadlocks, and is hence also forbidden. Both types of abuse will be caught by
+ * enabling CONFIG_PROVE_LOCKING.
+ *
+ * Nesting of acquire contexts for _different_ w/w classes is possible, subject
+ * to the usual locking rules between different lock classes.
+ *
+ * An acquire context must be released with ww_acquire_fini by the same task
+ * before the memory is freed. It is recommended to allocate the context itself
+ * on the stack.
+ */
+static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
+				   struct ww_class *ww_class)
+{
+	ctx->task = current;
+	ctx->stamp = atomic_long_inc_return(&ww_class->stamp);
+	ctx->acquired = 0;
+#ifdef CONFIG_DEBUG_MUTEXES
+	ctx->ww_class = ww_class;
+	ctx->done_acquire = 0;
+	ctx->contending_lock = NULL;
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	debug_check_no_locks_freed((void *)ctx, sizeof(*ctx));
+	lockdep_init_map(&ctx->dep_map, ww_class->acquire_name,
+			 &ww_class->acquire_key, 0);
+	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);
+#endif
+#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
+	ctx->deadlock_inject_interval = 1;
+	ctx->deadlock_inject_countdown = ctx->stamp & 0xf;
+#endif
+}
+
+/**
+ * ww_acquire_done - marks the end of the acquire phase
+ * @ctx: the acquire context
+ *
+ * Marks the end of the acquire phase, any further w/w mutex lock calls using
+ * this context are forbidden.
+ *
+ * Calling this function is optional, it is just useful to document w/w mutex
+ * code and clearly designated the acquire phase from actually using the locked
+ * data structures.
+ */
+static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+	lockdep_assert_held(ctx);
+
+	DEBUG_LOCKS_WARN_ON(ctx->done_acquire);
+	ctx->done_acquire = 1;
+#endif
+}
+
+/**
+ * ww_acquire_fini - releases a w/w acquire context
+ * @ctx: the acquire context to free
+ *
+ * Releases a w/w acquire context. This must be called _after_ all acquired w/w
+ * mutexes have been released with ww_mutex_unlock.
+ */
+static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+	mutex_release(&ctx->dep_map, 0, _THIS_IP_);
+
+	DEBUG_LOCKS_WARN_ON(ctx->acquired);
+	if (!config_enabled(CONFIG_PROVE_LOCKING))
+		/*
+		 * lockdep will normally handle this,
+		 * but fail without anyway
+		 */
+		ctx->done_acquire = 1;
+
+	if (!config_enabled(CONFIG_DEBUG_LOCK_ALLOC))
+		/* ensure ww_acquire_fini will still fail if called twice */
+		ctx->acquired = ~0U;
+#endif
+}
+
+extern int __must_check __ww_mutex_lock(struct ww_mutex *lock,
+					struct ww_acquire_ctx *ctx);
+extern int __must_check __ww_mutex_lock_interruptible(struct ww_mutex *lock,
+						      struct ww_acquire_ctx *ctx);
+
+/**
+ * ww_mutex_lock - acquire the w/w mutex
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context, or NULL to acquire only a single lock.
+ *
+ * Lock the w/w mutex exclusively for this task.
+ *
+ * Deadlocks within a given w/w class of locks are detected and handled with the
+ * wait/wound algorithm. If the lock isn't immediately avaiable this function
+ * will either sleep until it is (wait case). Or it selects the current context
+ * for backing off by returning -EDEADLK (wound case). Trying to acquire the
+ * same lock with the same context twice is also detected and signalled by
+ * returning -EALREADY. Returns 0 if the mutex was successfully acquired.
+ *
+ * In the wound case the caller must release all currently held w/w mutexes for
+ * the given context and then wait for this contending lock to be available by
+ * calling ww_mutex_lock_slow. Alternatively callers can opt to not acquire this
+ * lock and proceed with trying to acquire further w/w mutexes (e.g. when
+ * scanning through lru lists trying to free resources).
+ *
+ * The mutex must later on be released by the same task that
+ * acquired it. The task may not exit without first unlocking the mutex. Also,
+ * kernel memory where the mutex resides must not be freed with the mutex still
+ * locked. The mutex must first be initialized (or statically defined) before it
+ * can be locked. memset()-ing the mutex to 0 is not allowed. The mutex must be
+ * of the same w/w lock class as was used to initialize the acquire context.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ */
+static inline int ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	if (ctx)
+		return __ww_mutex_lock(lock, ctx);
+
+	mutex_lock(&lock->base);
+	return 0;
+}
+
+/**
+ * ww_mutex_lock_interruptible - acquire the w/w mutex, interruptible
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context
+ *
+ * Lock the w/w mutex exclusively for this task.
+ *
+ * Deadlocks within a given w/w class of locks are detected and handled with the
+ * wait/wound algorithm. If the lock isn't immediately avaiable this function
+ * will either sleep until it is (wait case). Or it selects the current context
+ * for backing off by returning -EDEADLK (wound case). Trying to acquire the
+ * same lock with the same context twice is also detected and signalled by
+ * returning -EALREADY. Returns 0 if the mutex was successfully acquired. If a
+ * signal arrives while waiting for the lock then this function returns -EINTR.
+ *
+ * In the wound case the caller must release all currently held w/w mutexes for
+ * the given context and then wait for this contending lock to be available by
+ * calling ww_mutex_lock_slow_interruptible. Alternatively callers can opt to
+ * not acquire this lock and proceed with trying to acquire further w/w mutexes
+ * (e.g. when scanning through lru lists trying to free resources).
+ *
+ * The mutex must later on be released by the same task that
+ * acquired it. The task may not exit without first unlocking the mutex. Also,
+ * kernel memory where the mutex resides must not be freed with the mutex still
+ * locked. The mutex must first be initialized (or statically defined) before it
+ * can be locked. memset()-ing the mutex to 0 is not allowed. The mutex must be
+ * of the same w/w lock class as was used to initialize the acquire context.
+ *
+ * A mutex acquired with this function must be released with ww_mutex_unlock.
+ */
+static inline int __must_check ww_mutex_lock_interruptible(struct ww_mutex *lock,
+							   struct ww_acquire_ctx *ctx)
+{
+	if (ctx)
+		return __ww_mutex_lock_interruptible(lock, ctx);
+	else
+		return mutex_lock_interruptible(&lock->base);
+}
+
+/**
+ * ww_mutex_lock_slow - slowpath acquiring of the w/w mutex
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context
+ *
+ * Acquires a w/w mutex with the given context after a wound case. This function
+ * will sleep until the lock becomes available.
+ *
+ * The caller must have released all w/w mutexes already acquired with the
+ * context and then call this function on the contended lock.
+ *
+ * Afterwards the caller may continue to (re)acquire the other w/w mutexes it
+ * needs with ww_mutex_lock. Note that the -EALREADY return code from
+ * ww_mutex_lock can be used to avoid locking this contended mutex twice.
+ *
+ * It is forbidden to call this function with any other w/w mutexes associated
+ * with the context held. It is forbidden to call this on anything else than the
+ * contending mutex.
+ *
+ * Note that the slowpath lock acquiring can also be done by calling
+ * ww_mutex_lock directly. This function here is simply to help w/w mutex
+ * locking code readability by clearly denoting the slowpath.
+ */
+static inline void
+ww_mutex_lock_slow(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
+{
+	int ret;
+#ifdef CONFIG_DEBUG_MUTEXES
+	DEBUG_LOCKS_WARN_ON(!ctx->contending_lock);
+#endif
+	ret = ww_mutex_lock(lock, ctx);
+	(void)ret;
+}
+
+/**
+ * ww_mutex_lock_slow_interruptible - slowpath acquiring of the w/w mutex, interruptible
+ * @lock: the mutex to be acquired
+ * @ctx: w/w acquire context
+ *
+ * Acquires a w/w mutex with the given context after a wound case. This function
+ * will sleep until the lock becomes available and returns 0 when the lock has
+ * been acquired. If a signal arrives while waiting for the lock then this
+ * function returns -EINTR.
+ *
+ * The caller must have released all w/w mutexes already acquired with the
+ * context and then call this function on the contended lock.
+ *
+ * Afterwards the caller may continue to (re)acquire the other w/w mutexes it
+ * needs with ww_mutex_lock. Note that the -EALREADY return code from
+ * ww_mutex_lock can be used to avoid locking this contended mutex twice.
+ *
+ * It is forbidden to call this function with any other w/w mutexes associated
+ * with the given context held. It is forbidden to call this on anything else
+ * than the contending mutex.
+ *
+ * Note that the slowpath lock acquiring can also be done by calling
+ * ww_mutex_lock_interruptible directly. This function here is simply to help
+ * w/w mutex locking code readability by clearly denoting the slowpath.
+ */
+static inline int __must_check
+ww_mutex_lock_slow_interruptible(struct ww_mutex *lock,
+				 struct ww_acquire_ctx *ctx)
+{
+#ifdef CONFIG_DEBUG_MUTEXES
+	DEBUG_LOCKS_WARN_ON(!ctx->contending_lock);
+#endif
+	return ww_mutex_lock_interruptible(lock, ctx);
+}
+
+extern void ww_mutex_unlock(struct ww_mutex *lock);
+
+/**
+ * ww_mutex_trylock - tries to acquire the w/w mutex without acquire context
+ * @lock: mutex to lock
+ *
+ * Trylocks a mutex without acquire context, so no deadlock detection is
+ * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
+ */
+static inline int __must_check ww_mutex_trylock(struct ww_mutex *lock)
+{
+	return mutex_trylock(&lock->base);
+}
+
+/***
+ * ww_mutex_destroy - mark a w/w mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+static inline void ww_mutex_destroy(struct ww_mutex *lock)
+{
+	mutex_destroy(&lock->base);
+}
+
+/**
+ * ww_mutex_is_locked - is the w/w mutex locked
+ * @lock: the mutex to be queried
+ *
+ * Returns 1 if the mutex is locked, 0 if unlocked.
+ */
+static inline bool ww_mutex_is_locked(struct ww_mutex *lock)
+{
+	return mutex_is_locked(&lock->base);
+}
+
+#endif
diff --git a/kernel/mutex.c b/kernel/mutex.c
index e581ada5faf4..ff05f4bd86eb 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -18,6 +18,7 @@
  * Also see Documentation/mutex-design.txt.
  */
 #include <linux/mutex.h>
+#include <linux/ww_mutex.h>
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
 #include <linux/export.h>
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index aad024dde3c4..6dc09d8f4c24 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -12,6 +12,7 @@
  */
 #include <linux/rwsem.h>
 #include <linux/mutex.h>
+#include <linux/ww_mutex.h>
 #include <linux/sched.h>
 #include <linux/delay.h>
 #include <linux/lockdep.h>
-- 
cgit v1.2.3


From 4f5e65a1cc90bbb15b9f6cdc362922af1bcc155a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 14:24:16 -0700
Subject: fput: turn "list_head delayed_fput_list" into llist_head

fput() and delayed_fput() can use llist and avoid the locking.

This is unlikely path, it is not that this change can improve
the performance, but this way the code looks simpler.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Suggested-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Vagin <avagin@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c    | 25 ++++++++++---------------
 include/linux/fs.h |  2 ++
 2 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/fs/file_table.c b/fs/file_table.c
index b9a77ad08b4d..b44e4c559786 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -265,18 +265,15 @@ static void __fput(struct file *file)
 	mntput(mnt);
 }
 
-static DEFINE_SPINLOCK(delayed_fput_lock);
-static LIST_HEAD(delayed_fput_list);
+static LLIST_HEAD(delayed_fput_list);
 static void delayed_fput(struct work_struct *unused)
 {
-	LIST_HEAD(head);
-	spin_lock_irq(&delayed_fput_lock);
-	list_splice_init(&delayed_fput_list, &head);
-	spin_unlock_irq(&delayed_fput_lock);
-	while (!list_empty(&head)) {
-		struct file *f = list_first_entry(&head, struct file, f_u.fu_list);
-		list_del_init(&f->f_u.fu_list);
-		__fput(f);
+	struct llist_node *node = llist_del_all(&delayed_fput_list);
+	struct llist_node *next;
+
+	for (; node; node = next) {
+		next = llist_next(node);
+		__fput(llist_entry(node, struct file, f_u.fu_llist));
 	}
 }
 
@@ -306,7 +303,6 @@ void fput(struct file *file)
 {
 	if (atomic_long_dec_and_test(&file->f_count)) {
 		struct task_struct *task = current;
-		unsigned long flags;
 
 		file_sb_list_del(file);
 		if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
@@ -320,10 +316,9 @@ void fput(struct file *file)
 			 * fput to avoid leaking *file.
 			 */
 		}
-		spin_lock_irqsave(&delayed_fput_lock, flags);
-		list_add(&file->f_u.fu_list, &delayed_fput_list);
-		schedule_work(&delayed_fput_work);
-		spin_unlock_irqrestore(&delayed_fput_lock, flags);
+
+		if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
+			schedule_work(&delayed_fput_work);
 	}
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 834c9e5113d9..d40e8e78bbd1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,6 +10,7 @@
 #include <linux/stat.h>
 #include <linux/cache.h>
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <linux/radix-tree.h>
 #include <linux/rbtree.h>
 #include <linux/init.h>
@@ -768,6 +769,7 @@ struct file {
 	 */
 	union {
 		struct list_head	fu_list;
+		struct llist_node	fu_llist;
 		struct rcu_head 	fu_rcuhead;
 	} f_u;
 	struct path		f_path;
-- 
cgit v1.2.3


From fb4214db50b00558cc6e274c88b3f7325068e942 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 14:24:18 -0700
Subject: llist: fix/simplify llist_add() and llist_add_batch()

1. This is mostly theoretical, but llist_add*() need ACCESS_ONCE().

   Otherwise it is not guaranteed that the first cmpxchg() uses the
   same value for old_entry and new_last->next.

2. These helpers cache the result of cmpxchg() and read the initial
   value of head->first before the main loop. I do not think this
   makes sense. In the likely case cmpxchg() succeeds, otherwise
   it doesn't hurt to reload head->first.

   I think it would be better to simplify the code and simply read
   ->first before cmpxchg().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Vagin <avagin@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/llist.h | 19 +++++++------------
 lib/llist.c           | 15 +++++----------
 2 files changed, 12 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/llist.h b/include/linux/llist.h
index a5199f6d0e82..3e2b969d68f6 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -151,18 +151,13 @@ static inline struct llist_node *llist_next(struct llist_node *node)
  */
 static inline bool llist_add(struct llist_node *new, struct llist_head *head)
 {
-	struct llist_node *entry, *old_entry;
-
-	entry = head->first;
-	for (;;) {
-		old_entry = entry;
-		new->next = entry;
-		entry = cmpxchg(&head->first, old_entry, new);
-		if (entry == old_entry)
-			break;
-	}
-
-	return old_entry == NULL;
+	struct llist_node *first;
+
+	do {
+		new->next = first = ACCESS_ONCE(head->first);
+	} while (cmpxchg(&head->first, first, new) != first);
+
+	return !first;
 }
 
 /**
diff --git a/lib/llist.c b/lib/llist.c
index 4a15115e90f8..4a70d120138c 100644
--- a/lib/llist.c
+++ b/lib/llist.c
@@ -39,18 +39,13 @@
 bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
 		     struct llist_head *head)
 {
-	struct llist_node *entry, *old_entry;
+	struct llist_node *first;
 
-	entry = head->first;
-	for (;;) {
-		old_entry = entry;
-		new_last->next = entry;
-		entry = cmpxchg(&head->first, old_entry, new_first);
-		if (entry == old_entry)
-			break;
-	}
+	do {
+		new_last->next = first = ACCESS_ONCE(head->first);
+	} while (cmpxchg(&head->first, first, new_first) != first);
 
-	return old_entry == NULL;
+	return !first;
 }
 EXPORT_SYMBOL_GPL(llist_add_batch);
 
-- 
cgit v1.2.3


From e9a17bd73a29e5323c37ec5ffe50fc0e825d3d03 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 8 Jul 2013 14:24:19 -0700
Subject: llist: llist_add() can use llist_add_batch()

llist_add(new, head) can simply use llist_add_batch(new, new, head),
no need to duplicate the code.

This obviously uninlines llist_add() and to me this is a win. But we
can make llist_add_batch() inline if this is desirable, in this case
gcc can notice that new_first == new_last if the caller is llist_add().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrey Vagin <avagin@openvz.org>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/llist.h | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/llist.h b/include/linux/llist.h
index 3e2b969d68f6..cdaa7f023899 100644
--- a/include/linux/llist.h
+++ b/include/linux/llist.h
@@ -142,6 +142,9 @@ static inline struct llist_node *llist_next(struct llist_node *node)
 	return node->next;
 }
 
+extern bool llist_add_batch(struct llist_node *new_first,
+			    struct llist_node *new_last,
+			    struct llist_head *head);
 /**
  * llist_add - add a new entry
  * @new:	new entry to be added
@@ -151,13 +154,7 @@ static inline struct llist_node *llist_next(struct llist_node *node)
  */
 static inline bool llist_add(struct llist_node *new, struct llist_head *head)
 {
-	struct llist_node *first;
-
-	do {
-		new->next = first = ACCESS_ONCE(head->first);
-	} while (cmpxchg(&head->first, first, new) != first);
-
-	return !first;
+	return llist_add_batch(new, new, head);
 }
 
 /**
@@ -173,9 +170,6 @@ static inline struct llist_node *llist_del_all(struct llist_head *head)
 	return xchg(&head->first, NULL);
 }
 
-extern bool llist_add_batch(struct llist_node *new_first,
-			    struct llist_node *new_last,
-			    struct llist_head *head);
 extern struct llist_node *llist_del_first(struct llist_head *head);
 
 #endif /* LLIST_H */
-- 
cgit v1.2.3


From a95e691f9c4a6e24fdeab6d7feae6d5411fe8a69 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 14 Jul 2013 16:43:54 +0400
Subject: rpc_create_*_dir: don't bother with qstr

just pass the name

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/sunrpc/rpc_pipe_fs.h |  4 ++--
 net/sunrpc/cache.c                 | 18 +++++-------------
 net/sunrpc/clnt.c                  | 20 ++++++++------------
 net/sunrpc/rpc_pipe.c              | 14 ++++++++------
 4 files changed, 23 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h
index a7b422b33eda..aa5b582cc471 100644
--- a/include/linux/sunrpc/rpc_pipe_fs.h
+++ b/include/linux/sunrpc/rpc_pipe_fs.h
@@ -73,12 +73,12 @@ extern ssize_t rpc_pipe_generic_upcall(struct file *, struct rpc_pipe_msg *,
 extern int rpc_queue_upcall(struct rpc_pipe *, struct rpc_pipe_msg *);
 
 struct rpc_clnt;
-extern struct dentry *rpc_create_client_dir(struct dentry *, struct qstr *, struct rpc_clnt *);
+extern struct dentry *rpc_create_client_dir(struct dentry *, const char *, struct rpc_clnt *);
 extern int rpc_remove_client_dir(struct dentry *);
 
 struct cache_detail;
 extern struct dentry *rpc_create_cache_dir(struct dentry *,
-					   struct qstr *,
+					   const char *,
 					   umode_t umode,
 					   struct cache_detail *);
 extern void rpc_remove_cache_dir(struct dentry *);
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 80fe5c86efd1..b40f9567e628 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1812,19 +1812,11 @@ int sunrpc_cache_register_pipefs(struct dentry *parent,
 				 const char *name, umode_t umode,
 				 struct cache_detail *cd)
 {
-	struct qstr q;
-	struct dentry *dir;
-	int ret = 0;
-
-	q.name = name;
-	q.len = strlen(name);
-	q.hash = full_name_hash(q.name, q.len);
-	dir = rpc_create_cache_dir(parent, &q, umode, cd);
-	if (!IS_ERR(dir))
-		cd->u.pipefs.dir = dir;
-	else
-		ret = PTR_ERR(dir);
-	return ret;
+	struct dentry *dir = rpc_create_cache_dir(parent, name, umode, cd);
+	if (IS_ERR(dir))
+		return PTR_ERR(dir);
+	cd->u.pipefs.dir = dir;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(sunrpc_cache_register_pipefs);
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 5a750b9c3640..26456274b24e 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -128,9 +128,7 @@ static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb,
 {
 	static uint32_t clntid;
 	char name[15];
-	struct qstr q = { .name = name };
 	struct dentry *dir, *dentry;
-	int error;
 
 	dir = rpc_d_lookup_sb(sb, dir_name);
 	if (dir == NULL) {
@@ -138,19 +136,17 @@ static struct dentry *rpc_setup_pipedir_sb(struct super_block *sb,
 		return dir;
 	}
 	for (;;) {
-		q.len = snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++);
+		snprintf(name, sizeof(name), "clnt%x", (unsigned int)clntid++);
 		name[sizeof(name) - 1] = '\0';
-		q.hash = full_name_hash(q.name, q.len);
-		dentry = rpc_create_client_dir(dir, &q, clnt);
+		dentry = rpc_create_client_dir(dir, name, clnt);
 		if (!IS_ERR(dentry))
 			break;
-		error = PTR_ERR(dentry);
-		if (error != -EEXIST) {
-			printk(KERN_INFO "RPC: Couldn't create pipefs entry"
-					" %s/%s, error %d\n",
-					dir_name, name, error);
-			break;
-		}
+		if (dentry == ERR_PTR(-EEXIST))
+			continue;
+		printk(KERN_INFO "RPC: Couldn't create pipefs entry"
+				" %s/%s, error %ld\n",
+				dir_name, name, PTR_ERR(dentry));
+		break;
 	}
 	dput(dir);
 	return dentry;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index e7ce4b3eb0bd..63364cb5d11a 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -770,15 +770,17 @@ out_bad:
 }
 
 static struct dentry *rpc_mkdir_populate(struct dentry *parent,
-		struct qstr *name, umode_t mode, void *private,
+		const char *name, umode_t mode, void *private,
 		int (*populate)(struct dentry *, void *), void *args_populate)
 {
 	struct dentry *dentry;
+	struct qstr q = QSTR_INIT(name, strlen(name));
 	struct inode *dir = parent->d_inode;
 	int error;
 
+	q.hash = full_name_hash(q.name, q.len);
 	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
-	dentry = __rpc_lookup_create_exclusive(parent, name);
+	dentry = __rpc_lookup_create_exclusive(parent, &q);
 	if (IS_ERR(dentry))
 		goto out;
 	error = __rpc_mkdir(dir, dentry, mode, NULL, private);
@@ -925,8 +927,8 @@ static void rpc_clntdir_depopulate(struct dentry *dentry)
 
 /**
  * rpc_create_client_dir - Create a new rpc_client directory in rpc_pipefs
- * @dentry: dentry from the rpc_pipefs root to the new directory
- * @name: &struct qstr for the name
+ * @dentry: the parent of new directory
+ * @name: the name of new directory
  * @rpc_client: rpc client to associate with this directory
  *
  * This creates a directory at the given @path associated with
@@ -935,7 +937,7 @@ static void rpc_clntdir_depopulate(struct dentry *dentry)
  * later be created using rpc_mkpipe().
  */
 struct dentry *rpc_create_client_dir(struct dentry *dentry,
-				   struct qstr *name,
+				   const char *name,
 				   struct rpc_clnt *rpc_client)
 {
 	return rpc_mkdir_populate(dentry, name, S_IRUGO | S_IXUGO, NULL,
@@ -981,7 +983,7 @@ static void rpc_cachedir_depopulate(struct dentry *dentry)
 	rpc_depopulate(dentry, cache_pipefs_files, 0, 3);
 }
 
-struct dentry *rpc_create_cache_dir(struct dentry *parent, struct qstr *name,
+struct dentry *rpc_create_cache_dir(struct dentry *parent, const char *name,
 				    umode_t umode, struct cache_detail *cd)
 {
 	return rpc_mkdir_populate(parent, name, umode, NULL,
-- 
cgit v1.2.3


From 1258ca805f613025ec079d959d4a78acfb1f79d3 Mon Sep 17 00:00:00 2001
From: Chanwoo Choi <cw00.choi@samsung.com>
Date: Thu, 11 Jul 2013 13:55:58 +0900
Subject: PM / Sleep: Fix comment typo in pm_wakeup.h

Fix a comment typo (sorce -> source) in pm_wakeup.h.

[rjw: Changelog]
Signed-off-by: Chanwoo Choi <cw00.choi@samsung.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 include/linux/pm_wakeup.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h
index 569781faa504..a0f70808d7f4 100644
--- a/include/linux/pm_wakeup.h
+++ b/include/linux/pm_wakeup.h
@@ -36,8 +36,8 @@
  * @last_time: Monotonic clock when the wakeup source's was touched last time.
  * @prevent_sleep_time: Total time this source has been preventing autosleep.
  * @event_count: Number of signaled wakeup events.
- * @active_count: Number of times the wakeup sorce was activated.
- * @relax_count: Number of times the wakeup sorce was deactivated.
+ * @active_count: Number of times the wakeup source was activated.
+ * @relax_count: Number of times the wakeup source was deactivated.
  * @expire_count: Number of times the wakeup source's timeout has expired.
  * @wakeup_count: Number of times the wakeup source might abort suspend.
  * @active: Status of the wakeup source.
-- 
cgit v1.2.3


From 0db0628d90125193280eabb501c94feaf48fa9ab Mon Sep 17 00:00:00 2001
From: Paul Gortmaker <paul.gortmaker@windriver.com>
Date: Wed, 19 Jun 2013 14:53:51 -0400
Subject: kernel: delete __cpuinit usage from all core kernel files

The __cpuinit type of throwaway sections might have made sense
some time ago when RAM was more constrained, but now the savings
do not offset the cost and complications.  For example, the fix in
commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time")
is a good example of the nasty type of bugs that can be created
with improper use of the various __init prefixes.

After a discussion on LKML[1] it was decided that cpuinit should go
the way of devinit and be phased out.  Once all the users are gone,
we can then finally remove the macros themselves from linux/init.h.

This removes all the uses of the __cpuinit macros from C files in
the core kernel directories (kernel, init, lib, mm, and include)
that don't really have a specific maintainer.

[1] https://lkml.org/lkml/2013/5/20/589

Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
---
 Documentation/cpu-hotplug.txt |  6 +++---
 include/linux/cpu.h           |  2 +-
 include/linux/perf_event.h    |  2 +-
 init/calibrate.c              | 13 ++++++++-----
 kernel/cpu.c                  |  6 +++---
 kernel/events/core.c          |  4 ++--
 kernel/fork.c                 |  2 +-
 kernel/hrtimer.c              |  6 +++---
 kernel/printk.c               |  2 +-
 kernel/profile.c              |  2 +-
 kernel/relay.c                |  2 +-
 kernel/sched/core.c           | 12 ++++++------
 kernel/sched/fair.c           |  2 +-
 kernel/smp.c                  |  2 +-
 kernel/smpboot.c              |  2 +-
 kernel/softirq.c              |  8 ++++----
 kernel/time/tick-sched.c      |  2 +-
 kernel/timer.c                | 10 +++++-----
 kernel/workqueue.c            |  4 ++--
 lib/Kconfig.debug             |  2 +-
 lib/earlycpio.c               |  2 +-
 lib/percpu_counter.c          |  2 +-
 mm/memcontrol.c               |  2 +-
 mm/page-writeback.c           |  4 ++--
 mm/slab.c                     | 10 +++++-----
 mm/slub.c                     |  4 ++--
 mm/vmstat.c                   |  6 +++---
 27 files changed, 62 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index edd4b4df3932..786dc82f98ce 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -267,8 +267,8 @@ Q: If i have some kernel code that needs to be aware of CPU arrival and
 A: This is what you would need in your kernel code to receive notifications.
 
 	#include <linux/cpu.h>
-	static int __cpuinit foobar_cpu_callback(struct notifier_block *nfb,
-					    unsigned long action, void *hcpu)
+	static int foobar_cpu_callback(struct notifier_block *nfb,
+				       unsigned long action, void *hcpu)
 	{
 		unsigned int cpu = (unsigned long)hcpu;
 
@@ -285,7 +285,7 @@ A: This is what you would need in your kernel code to receive notifications.
 		return NOTIFY_OK;
 	}
 
-	static struct notifier_block __cpuinitdata foobar_cpu_notifer =
+	static struct notifier_block foobar_cpu_notifer =
 	{
 	   .notifier_call = foobar_cpu_callback,
 	};
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 944f283f01c4..ab0eade73039 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -114,7 +114,7 @@ enum {
 /* Need to know about CPUs going up/down? */
 #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)
 #define cpu_notifier(fn, pri) {					\
-	static struct notifier_block fn##_nb __cpuinitdata =	\
+	static struct notifier_block fn##_nb =			\
 		{ .notifier_call = fn, .priority = pri };	\
 	register_cpu_notifier(&fn##_nb);			\
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 8873f82c7baa..c43f6eabad5b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -826,7 +826,7 @@ static inline void perf_restore_debug_store(void)			{ }
  */
 #define perf_cpu_notifier(fn)						\
 do {									\
-	static struct notifier_block fn##_nb __cpuinitdata =		\
+	static struct notifier_block fn##_nb =				\
 		{ .notifier_call = fn, .priority = CPU_PRI_PERF };	\
 	unsigned long cpu = smp_processor_id();				\
 	unsigned long flags;						\
diff --git a/init/calibrate.c b/init/calibrate.c
index fda0a7b0f06c..520702db9acc 100644
--- a/init/calibrate.c
+++ b/init/calibrate.c
@@ -31,7 +31,7 @@ __setup("lpj=", lpj_setup);
 #define DELAY_CALIBRATION_TICKS			((HZ < 100) ? 1 : (HZ/100))
 #define MAX_DIRECT_CALIBRATION_RETRIES		5
 
-static unsigned long __cpuinit calibrate_delay_direct(void)
+static unsigned long calibrate_delay_direct(void)
 {
 	unsigned long pre_start, start, post_start;
 	unsigned long pre_end, end, post_end;
@@ -166,7 +166,10 @@ static unsigned long __cpuinit calibrate_delay_direct(void)
 	return 0;
 }
 #else
-static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
+static unsigned long calibrate_delay_direct(void)
+{
+	return 0;
+}
 #endif
 
 /*
@@ -180,7 +183,7 @@ static unsigned long __cpuinit calibrate_delay_direct(void) {return 0;}
  */
 #define LPS_PREC 8
 
-static unsigned long __cpuinit calibrate_delay_converge(void)
+static unsigned long calibrate_delay_converge(void)
 {
 	/* First stage - slowly accelerate to find initial bounds */
 	unsigned long lpj, lpj_base, ticks, loopadd, loopadd_base, chop_limit;
@@ -254,12 +257,12 @@ static DEFINE_PER_CPU(unsigned long, cpu_loops_per_jiffy) = { 0 };
  * Architectures should override this function if a faster calibration
  * method is available.
  */
-unsigned long __attribute__((weak)) __cpuinit calibrate_delay_is_known(void)
+unsigned long __attribute__((weak)) calibrate_delay_is_known(void)
 {
 	return 0;
 }
 
-void __cpuinit calibrate_delay(void)
+void calibrate_delay(void)
 {
 	unsigned long lpj;
 	static bool printed;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 198a38883e64..b2b227b82123 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -366,7 +366,7 @@ EXPORT_SYMBOL(cpu_down);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 /* Requires cpu_add_remove_lock to be held */
-static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
+static int _cpu_up(unsigned int cpu, int tasks_frozen)
 {
 	int ret, nr_calls = 0;
 	void *hcpu = (void *)(long)cpu;
@@ -419,7 +419,7 @@ out:
 	return ret;
 }
 
-int __cpuinit cpu_up(unsigned int cpu)
+int cpu_up(unsigned int cpu)
 {
 	int err = 0;
 
@@ -618,7 +618,7 @@ core_initcall(cpu_hotplug_pm_sync_init);
  * It must be called by the arch code on the new cpu, before the new cpu
  * enables interrupts and before the "boot" cpu returns from __cpu_up().
  */
-void __cpuinit notify_cpu_starting(unsigned int cpu)
+void notify_cpu_starting(unsigned int cpu)
 {
 	unsigned long val = CPU_STARTING;
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eba8fb5834ae..f3e9dce39bc9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7630,7 +7630,7 @@ static void __init perf_event_init_all_cpus(void)
 	}
 }
 
-static void __cpuinit perf_event_init_cpu(int cpu)
+static void perf_event_init_cpu(int cpu)
 {
 	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
@@ -7719,7 +7719,7 @@ static struct notifier_block perf_reboot_notifier = {
 	.priority = INT_MIN,
 };
 
-static int __cpuinit
+static int
 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (long)hcpu;
diff --git a/kernel/fork.c b/kernel/fork.c
index 66635c80a813..403d2bb8a968 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1546,7 +1546,7 @@ static inline void init_idle_pids(struct pid_link *links)
 	}
 }
 
-struct task_struct * __cpuinit fork_idle(int cpu)
+struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
 	task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f0f4fe29cd21..383319bae3f7 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1659,7 +1659,7 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
 /*
  * Functions related to boot-time initialization:
  */
-static void __cpuinit init_hrtimers_cpu(int cpu)
+static void init_hrtimers_cpu(int cpu)
 {
 	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
 	int i;
@@ -1740,7 +1740,7 @@ static void migrate_hrtimers(int scpu)
 
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
+static int hrtimer_cpu_notify(struct notifier_block *self,
 					unsigned long action, void *hcpu)
 {
 	int scpu = (long)hcpu;
@@ -1773,7 +1773,7 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata hrtimers_nb = {
+static struct notifier_block hrtimers_nb = {
 	.notifier_call = hrtimer_cpu_notify,
 };
 
diff --git a/kernel/printk.c b/kernel/printk.c
index d37d45c90ae6..69b0890ed7e5 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1921,7 +1921,7 @@ void resume_console(void)
  * called when a new CPU comes online (or fails to come up), and ensures
  * that any such output gets printed.
  */
-static int __cpuinit console_cpu_notify(struct notifier_block *self,
+static int console_cpu_notify(struct notifier_block *self,
 	unsigned long action, void *hcpu)
 {
 	switch (action) {
diff --git a/kernel/profile.c b/kernel/profile.c
index 0bf400737660..6631e1ef55ab 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -331,7 +331,7 @@ out:
 	put_cpu();
 }
 
-static int __cpuinit profile_cpu_callback(struct notifier_block *info,
+static int profile_cpu_callback(struct notifier_block *info,
 					unsigned long action, void *__cpu)
 {
 	int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/relay.c b/kernel/relay.c
index b91488ba2e5a..5001c9887db1 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -516,7 +516,7 @@ static void setup_callbacks(struct rchan *chan,
  *
  * 	Returns the success/failure of the operation. (%NOTIFY_OK, %NOTIFY_BAD)
  */
-static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
+static int relay_hotcpu_callback(struct notifier_block *nb,
 				unsigned long action,
 				void *hcpu)
 {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d8eb4525e76..b7c32cb7bfeb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4133,7 +4133,7 @@ void show_state_filter(unsigned long state_filter)
 		debug_show_all_locks();
 }
 
-void __cpuinit init_idle_bootup_task(struct task_struct *idle)
+void init_idle_bootup_task(struct task_struct *idle)
 {
 	idle->sched_class = &idle_sched_class;
 }
@@ -4146,7 +4146,7 @@ void __cpuinit init_idle_bootup_task(struct task_struct *idle)
  * NOTE: this function does not set the idle thread's NEED_RESCHED
  * flag, to make booting more robust.
  */
-void __cpuinit init_idle(struct task_struct *idle, int cpu)
+void init_idle(struct task_struct *idle, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
@@ -4630,7 +4630,7 @@ static void set_rq_offline(struct rq *rq)
  * migration_call - callback that gets triggered when a CPU is added.
  * Here we can start up the necessary migration thread for the new CPU.
  */
-static int __cpuinit
+static int
 migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 {
 	int cpu = (long)hcpu;
@@ -4684,12 +4684,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
  * happens before everything else.  This has to be lower priority than
  * the notifier in the perf_event subsystem, though.
  */
-static struct notifier_block __cpuinitdata migration_notifier = {
+static struct notifier_block migration_notifier = {
 	.notifier_call = migration_call,
 	.priority = CPU_PRI_MIGRATION,
 };
 
-static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+static int sched_cpu_active(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
@@ -4702,7 +4702,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
 	}
 }
 
-static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+static int sched_cpu_inactive(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c527449..bb456f44b7b1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5506,7 +5506,7 @@ void nohz_balance_enter_idle(int cpu)
 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
 
-static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
+static int sched_ilb_notifier(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
diff --git a/kernel/smp.c b/kernel/smp.c
index 4dba0f7b72ad..fe9f773d7114 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -73,7 +73,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
+static struct notifier_block hotplug_cfd_notifier = {
 	.notifier_call		= hotplug_cfd,
 };
 
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 02fc5c933673..eb89e1807408 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -24,7 +24,7 @@
  */
 static DEFINE_PER_CPU(struct task_struct *, idle_threads);
 
-struct task_struct * __cpuinit idle_thread_get(unsigned int cpu)
+struct task_struct *idle_thread_get(unsigned int cpu)
 {
 	struct task_struct *tsk = per_cpu(idle_threads, cpu);
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index ca25e6e704a2..be3d3514c325 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -699,7 +699,7 @@ void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
 }
 EXPORT_SYMBOL(send_remote_softirq);
 
-static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
+static int remote_softirq_cpu_notify(struct notifier_block *self,
 					       unsigned long action, void *hcpu)
 {
 	/*
@@ -728,7 +728,7 @@ static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
+static struct notifier_block remote_softirq_cpu_notifier = {
 	.notifier_call	= remote_softirq_cpu_notify,
 };
 
@@ -830,7 +830,7 @@ static void takeover_tasklets(unsigned int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int __cpuinit cpu_callback(struct notifier_block *nfb,
+static int cpu_callback(struct notifier_block *nfb,
 				  unsigned long action,
 				  void *hcpu)
 {
@@ -845,7 +845,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata cpu_nfb = {
+static struct notifier_block cpu_nfb = {
 	.notifier_call = cpu_callback
 };
 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 69601726a745..e80183f4a6c4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -298,7 +298,7 @@ static int __init tick_nohz_full_setup(char *str)
 }
 __setup("nohz_full=", tick_nohz_full_setup);
 
-static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
+static int tick_nohz_cpu_down_callback(struct notifier_block *nfb,
 						 unsigned long action,
 						 void *hcpu)
 {
diff --git a/kernel/timer.c b/kernel/timer.c
index 15bc1b41021d..4296d13db3d1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1505,11 +1505,11 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
 }
 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 
-static int __cpuinit init_timers_cpu(int cpu)
+static int init_timers_cpu(int cpu)
 {
 	int j;
 	struct tvec_base *base;
-	static char __cpuinitdata tvec_base_done[NR_CPUS];
+	static char tvec_base_done[NR_CPUS];
 
 	if (!tvec_base_done[cpu]) {
 		static char boot_done;
@@ -1577,7 +1577,7 @@ static void migrate_timer_list(struct tvec_base *new_base, struct list_head *hea
 	}
 }
 
-static void __cpuinit migrate_timers(int cpu)
+static void migrate_timers(int cpu)
 {
 	struct tvec_base *old_base;
 	struct tvec_base *new_base;
@@ -1610,7 +1610,7 @@ static void __cpuinit migrate_timers(int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int __cpuinit timer_cpu_notify(struct notifier_block *self,
+static int timer_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -1635,7 +1635,7 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata timers_nb = {
+static struct notifier_block timers_nb = {
 	.notifier_call	= timer_cpu_notify,
 };
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f02c4a4a0c3c..0b72e816b8d0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4644,7 +4644,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
  * Workqueues should be brought up before normal priority CPU notifiers.
  * This will be registered high priority CPU notifier.
  */
-static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
+static int workqueue_cpu_up_callback(struct notifier_block *nfb,
 					       unsigned long action,
 					       void *hcpu)
 {
@@ -4697,7 +4697,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
  * Workqueues should be brought down after normal priority CPU notifiers.
  * This will be registered as low priority CPU notifier.
  */
-static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
+static int workqueue_cpu_down_callback(struct notifier_block *nfb,
 						 unsigned long action,
 						 void *hcpu)
 {
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 98ac17ed6222..1501aa553221 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -238,7 +238,7 @@ config DEBUG_SECTION_MISMATCH
 	  any use of code/data previously in these sections would
 	  most likely result in an oops.
 	  In the code, functions and variables are annotated with
-	  __init, __cpuinit, etc. (see the full list in include/linux/init.h),
+	  __init,, etc. (see the full list in include/linux/init.h),
 	  which results in the code/data being placed in specific sections.
 	  The section mismatch analysis is always performed after a full
 	  kernel build, and enabling this option causes the following
diff --git a/lib/earlycpio.c b/lib/earlycpio.c
index 8078ef49cb79..7aa7ce250c94 100644
--- a/lib/earlycpio.c
+++ b/lib/earlycpio.c
@@ -63,7 +63,7 @@ enum cpio_fields {
  *          the match returned an empty filename string.
  */
 
-struct cpio_data __cpuinit find_cpio_data(const char *path, void *data,
+struct cpio_data find_cpio_data(const char *path, void *data,
 					  size_t len,  long *offset)
 {
 	const size_t cpio_header_len = 8*C_NFIELDS - 2;
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 1fc23a3277e1..93c5d5ecff4e 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -158,7 +158,7 @@ static void compute_batch_value(void)
 	percpu_counter_batch = max(32, nr*2);
 }
 
-static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
+static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
 					unsigned long action, void *hcpu)
 {
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d12ca6f3c293..00a7a664b9c1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2522,7 +2522,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
 	spin_unlock(&memcg->pcp_counter_lock);
 }
 
-static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
+static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 					unsigned long action,
 					void *hcpu)
 {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4514ad7415c3..3f0c895c71fe 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1619,7 +1619,7 @@ void writeback_set_ratelimit(void)
 		ratelimit_pages = 16;
 }
 
-static int __cpuinit
+static int
 ratelimit_handler(struct notifier_block *self, unsigned long action,
 		  void *hcpu)
 {
@@ -1634,7 +1634,7 @@ ratelimit_handler(struct notifier_block *self, unsigned long action,
 	}
 }
 
-static struct notifier_block __cpuinitdata ratelimit_nb = {
+static struct notifier_block ratelimit_nb = {
 	.notifier_call	= ratelimit_handler,
 	.next		= NULL,
 };
diff --git a/mm/slab.c b/mm/slab.c
index 35cb0c861508..2580db062df9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -787,7 +787,7 @@ static void next_reap_node(void)
  * the CPUs getting into lockstep and contending for the global cache chain
  * lock.
  */
-static void __cpuinit start_cpu_timer(int cpu)
+static void start_cpu_timer(int cpu)
 {
 	struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
 
@@ -1186,7 +1186,7 @@ static inline int slabs_tofree(struct kmem_cache *cachep,
 	return (n->free_objects + cachep->num - 1) / cachep->num;
 }
 
-static void __cpuinit cpuup_canceled(long cpu)
+static void cpuup_canceled(long cpu)
 {
 	struct kmem_cache *cachep;
 	struct kmem_cache_node *n = NULL;
@@ -1251,7 +1251,7 @@ free_array_cache:
 	}
 }
 
-static int __cpuinit cpuup_prepare(long cpu)
+static int cpuup_prepare(long cpu)
 {
 	struct kmem_cache *cachep;
 	struct kmem_cache_node *n = NULL;
@@ -1334,7 +1334,7 @@ bad:
 	return -ENOMEM;
 }
 
-static int __cpuinit cpuup_callback(struct notifier_block *nfb,
+static int cpuup_callback(struct notifier_block *nfb,
 				    unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -1390,7 +1390,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 	return notifier_from_errno(err);
 }
 
-static struct notifier_block __cpuinitdata cpucache_notifier = {
+static struct notifier_block cpucache_notifier = {
 	&cpuup_callback, NULL, 0
 };
 
diff --git a/mm/slub.c b/mm/slub.c
index 3b482c863002..2b02d666bf63 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3773,7 +3773,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
  * Use the cpu notifier to insure that the cpu slabs are flushed when
  * necessary.
  */
-static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
+static int slab_cpuup_callback(struct notifier_block *nfb,
 		unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
@@ -3799,7 +3799,7 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata slab_notifier = {
+static struct notifier_block slab_notifier = {
 	.notifier_call = slab_cpuup_callback
 };
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f42745e65780..20c2ef4458fa 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1182,7 +1182,7 @@ static void vmstat_update(struct work_struct *w)
 		round_jiffies_relative(sysctl_stat_interval));
 }
 
-static void __cpuinit start_cpu_timer(int cpu)
+static void start_cpu_timer(int cpu)
 {
 	struct delayed_work *work = &per_cpu(vmstat_work, cpu);
 
@@ -1194,7 +1194,7 @@ static void __cpuinit start_cpu_timer(int cpu)
  * Use the cpu notifier to insure that the thresholds are recalculated
  * when necessary.
  */
-static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
+static int vmstat_cpuup_callback(struct notifier_block *nfb,
 		unsigned long action,
 		void *hcpu)
 {
@@ -1226,7 +1226,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
 	return NOTIFY_OK;
 }
 
-static struct notifier_block __cpuinitdata vmstat_notifier =
+static struct notifier_block vmstat_notifier =
 	{ &vmstat_cpuup_callback, NULL, 0 };
 #endif
 
-- 
cgit v1.2.3


From b9b3259746d77f4fcb786e2a43c25bcc40773755 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 14 Jul 2013 16:05:51 -0700
Subject: sysfs.h: add __ATTR_RW() macro

A number of parts of the kernel created their own version of this, might
as well have the sysfs core provide it instead.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 2 ++
 kernel/events/core.c  | 2 --
 mm/backing-dev.c      | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index e2cee22f578a..9cd20c8404e5 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -79,6 +79,8 @@ struct attribute_group {
 	.show	= _name##_show,					\
 }
 
+#define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store)
+
 #define __ATTR_NULL { .attr = { .name = NULL } }
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/events/core.c b/kernel/events/core.c
index eba8fb5834ae..dd9878029d1f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6234,8 +6234,6 @@ perf_event_mux_interval_ms_store(struct device *dev,
 	return count;
 }
 
-#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
-
 static struct device_attribute pmu_dev_attrs[] = {
 	__ATTR_RO(type),
 	__ATTR_RW(perf_event_mux_interval_ms),
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d014ee5fcbbd..e04454cdb33f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -232,8 +232,6 @@ static ssize_t stable_pages_required_show(struct device *dev,
 			bdi_cap_stable_pages_required(bdi) ? 1 : 0);
 }
 
-#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
-
 static struct device_attribute bdi_dev_attrs[] = {
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
-- 
cgit v1.2.3


From f2f37f58b1b933b06d6d84e80a31a1b500fb0db2 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 14 Jul 2013 16:05:52 -0700
Subject: sysfs.h: add ATTRIBUTE_GROUPS() macro

To make it easier for driver subsystems to work with attribute groups,
create the ATTRIBUTE_GROUPS macro to remove some of the repetitive
typing for the most common use for attribute groups.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 9cd20c8404e5..f62ff01e5f59 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -94,6 +94,15 @@ struct attribute_group {
 #define __ATTR_IGNORE_LOCKDEP	__ATTR
 #endif
 
+#define ATTRIBUTE_GROUPS(name)					\
+static const struct attribute_group name##_group = {		\
+	.attrs = name##_attrs,					\
+};								\
+static const struct attribute_group *name##_groups[] = {	\
+	&name##_group,						\
+	NULL,							\
+}
+
 #define attr_name(_attr) (_attr).attr.name
 
 struct file;
-- 
cgit v1.2.3


From e4b63603c2a1e2c4db3de11b0f2b17360a7695bb Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 14 Jul 2013 16:05:53 -0700
Subject: sysfs.h: add BIN_ATTR macro

This makes it easier to create static binary attributes, which is needed
in a number of drivers, instead of "open coding" them.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index f62ff01e5f59..d50a96b9bb6d 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -132,6 +132,15 @@ struct bin_attribute {
  */
 #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr)
 
+/* macro to create static binary attributes easier */
+#define BIN_ATTR(_name, _mode, _read, _write, _size)		\
+struct bin_attribute bin_attr_##_name = {			\
+	.attr = {.name = __stringify(_name), .mode = _mode },	\
+	.read	= _read,					\
+	.write	= _write,					\
+	.size	= _size,					\
+}
+
 struct sysfs_ops {
 	ssize_t	(*show)(struct kobject *, struct attribute *,char *);
 	ssize_t	(*store)(struct kobject *,struct attribute *,const char *, size_t);
-- 
cgit v1.2.3


From ced321bf9151535f85779b0004c93529f860b2a4 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 14 Jul 2013 16:05:54 -0700
Subject: driver core: device.h: add RW and RO attribute macros

Make it easier to create attributes without having to always audit the
mode settings.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/device.h | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index bcf8c0d4cd98..f207a8f49f80 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -47,7 +47,11 @@ struct bus_attribute {
 };
 
 #define BUS_ATTR(_name, _mode, _show, _store)	\
-struct bus_attribute bus_attr_##_name = __ATTR(_name, _mode, _show, _store)
+	struct bus_attribute bus_attr_##_name = __ATTR(_name, _mode, _show, _store)
+#define BUS_ATTR_RW(_name) \
+	struct bus_attribute bus_attr_##_name = __ATTR_RW(_name)
+#define BUS_ATTR_RO(_name) \
+	struct bus_attribute bus_attr_##_name = __ATTR_RO(_name)
 
 extern int __must_check bus_create_file(struct bus_type *,
 					struct bus_attribute *);
@@ -261,9 +265,12 @@ struct driver_attribute {
 			 size_t count);
 };
 
-#define DRIVER_ATTR(_name, _mode, _show, _store)	\
-struct driver_attribute driver_attr_##_name =		\
-	__ATTR(_name, _mode, _show, _store)
+#define DRIVER_ATTR(_name, _mode, _show, _store) \
+	struct driver_attribute driver_attr_##_name = __ATTR(_name, _mode, _show, _store)
+#define DRIVER_ATTR_RW(_name) \
+	struct driver_attribute driver_attr_##_name = __ATTR_RW(_name)
+#define DRIVER_ATTR_RO(_name) \
+	struct driver_attribute driver_attr_##_name = __ATTR_RO(_name)
 
 extern int __must_check driver_create_file(struct device_driver *driver,
 					const struct driver_attribute *attr);
@@ -414,8 +421,12 @@ struct class_attribute {
 				 const struct class_attribute *attr);
 };
 
-#define CLASS_ATTR(_name, _mode, _show, _store)			\
-struct class_attribute class_attr_##_name = __ATTR(_name, _mode, _show, _store)
+#define CLASS_ATTR(_name, _mode, _show, _store) \
+	struct class_attribute class_attr_##_name = __ATTR(_name, _mode, _show, _store)
+#define CLASS_ATTR_RW(_name) \
+	struct class_attribute class_attr_##_name = __ATTR_RW(_name)
+#define CLASS_ATTR_RO(_name) \
+	struct class_attribute class_attr_##_name = __ATTR_RO(_name)
 
 extern int __must_check class_create_file(struct class *class,
 					  const struct class_attribute *attr);
@@ -423,7 +434,6 @@ extern void class_remove_file(struct class *class,
 			      const struct class_attribute *attr);
 
 /* Simple class attribute that is just a static string */
-
 struct class_attribute_string {
 	struct class_attribute attr;
 	char *str;
@@ -512,6 +522,10 @@ ssize_t device_store_bool(struct device *dev, struct device_attribute *attr,
 
 #define DEVICE_ATTR(_name, _mode, _show, _store) \
 	struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store)
+#define DEVICE_ATTR_RW(_name) \
+	struct device_attribute dev_attr_##_name = __ATTR_RW(_name)
+#define DEVICE_ATTR_RO(_name) \
+	struct device_attribute dev_attr_##_name = __ATTR_RO(_name)
 #define DEVICE_ULONG_ATTR(_name, _mode, _var) \
 	struct dev_ext_attribute dev_attr_##_name = \
 		{ __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) }
-- 
cgit v1.2.3


From 6ab9cea16075ea707022753395f340b67f64304c Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 14 Jul 2013 16:05:55 -0700
Subject: sysfs: add support for binary attributes in groups

groups should be able to support binary attributes, just like it
supports "normal" attributes.  This lets us only handle one type of
structure, groups, throughout the driver core and subsystems, making
binary attributes a "full fledged" part of the driver model, and not
something just "tacked on".

Reported-by: Oliver Schinagl <oliver@schinagl.nl>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/group.c      | 66 +++++++++++++++++++++++++++++++++++----------------
 include/linux/sysfs.h |  4 ++--
 2 files changed, 48 insertions(+), 22 deletions(-)

(limited to 'include/linux')

diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index aec3d5c98c94..e5719c6095c3 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -20,38 +20,64 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			 const struct attribute_group *grp)
 {
 	struct attribute *const* attr;
-	int i;
+	struct bin_attribute *const* bin_attr;
 
-	for (i = 0, attr = grp->attrs; *attr; i++, attr++)
-		sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+	if (grp->attrs)
+		for (attr = grp->attrs; *attr; attr++)
+			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
+	if (grp->bin_attrs)
+		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
+			sysfs_remove_bin_file(kobj, *bin_attr);
 }
 
 static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			const struct attribute_group *grp, int update)
 {
 	struct attribute *const* attr;
+	struct bin_attribute *const* bin_attr;
 	int error = 0, i;
 
-	for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
-		umode_t mode = 0;
+	if (grp->attrs) {
+		for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) {
+			umode_t mode = 0;
+
+			/*
+			 * In update mode, we're changing the permissions or
+			 * visibility.  Do this by first removing then
+			 * re-adding (if required) the file.
+			 */
+			if (update)
+				sysfs_hash_and_remove(dir_sd, NULL,
+						      (*attr)->name);
+			if (grp->is_visible) {
+				mode = grp->is_visible(kobj, *attr, i);
+				if (!mode)
+					continue;
+			}
+			error = sysfs_add_file_mode(dir_sd, *attr,
+						    SYSFS_KOBJ_ATTR,
+						    (*attr)->mode | mode);
+			if (unlikely(error))
+				break;
+		}
+		if (error) {
+			remove_files(dir_sd, kobj, grp);
+			goto exit;
+		}
+	}
 
-		/* in update mode, we're changing the permissions or
-		 * visibility.  Do this by first removing then
-		 * re-adding (if required) the file */
-		if (update)
-			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
-		if (grp->is_visible) {
-			mode = grp->is_visible(kobj, *attr, i);
-			if (!mode)
-				continue;
+	if (grp->bin_attrs) {
+		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
+			if (update)
+				sysfs_remove_bin_file(kobj, *bin_attr);
+			error = sysfs_create_bin_file(kobj, *bin_attr);
+			if (error)
+				break;
 		}
-		error = sysfs_add_file_mode(dir_sd, *attr, SYSFS_KOBJ_ATTR,
-					    (*attr)->mode | mode);
-		if (unlikely(error))
-			break;
+		if (error)
+			remove_files(dir_sd, kobj, grp);
 	}
-	if (error)
-		remove_files(dir_sd, kobj, grp);
+exit:
 	return error;
 }
 
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index d50a96b9bb6d..2c3b6a30697d 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -21,6 +21,7 @@
 
 struct kobject;
 struct module;
+struct bin_attribute;
 enum kobj_ns_type;
 
 struct attribute {
@@ -59,10 +60,9 @@ struct attribute_group {
 	umode_t			(*is_visible)(struct kobject *,
 					      struct attribute *, int);
 	struct attribute	**attrs;
+	struct bin_attribute	**bin_attrs;
 };
 
-
-
 /**
  * Use these macros to make defining attributes easier. See include/linux/device.h
  * for examples..
-- 
cgit v1.2.3


From 39ef311204941ddd01ea2950d6220c8ccc710d15 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <linux@roeck-us.net>
Date: Sun, 14 Jul 2013 16:05:57 -0700
Subject: driver core: Introduce device_create_groups

device_create_groups lets callers create devices as well as associated
sysfs attributes with a single call. This avoids race conditions seen
if sysfs attributes on new devices are created later.

[fixed up comment block placement and add checks for printk buffer
formats - gregkh]

Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Cc: Jean Delvare <khali@linux-fr.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 111 ++++++++++++++++++++++++++++++++++++-------------
 include/linux/device.h |   5 +++
 2 files changed, 88 insertions(+), 28 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index dc3ea237f086..a8aae1823f73 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1667,34 +1667,11 @@ static void device_create_release(struct device *dev)
 	kfree(dev);
 }
 
-/**
- * device_create_vargs - creates a device and registers it with sysfs
- * @class: pointer to the struct class that this device should be registered to
- * @parent: pointer to the parent struct device of this new device, if any
- * @devt: the dev_t for the char device to be added
- * @drvdata: the data to be added to the device for callbacks
- * @fmt: string for the device's name
- * @args: va_list for the device's name
- *
- * This function can be used by char device classes.  A struct device
- * will be created in sysfs, registered to the specified class.
- *
- * A "dev" file will be created, showing the dev_t for the device, if
- * the dev_t is not 0,0.
- * If a pointer to a parent struct device is passed in, the newly created
- * struct device will be a child of that device in sysfs.
- * The pointer to the struct device will be returned from the call.
- * Any further sysfs files that might be required can be created using this
- * pointer.
- *
- * Returns &struct device pointer on success, or ERR_PTR() on error.
- *
- * Note: the struct class passed to this function must have previously
- * been created with a call to class_create().
- */
-struct device *device_create_vargs(struct class *class, struct device *parent,
-				   dev_t devt, void *drvdata, const char *fmt,
-				   va_list args)
+static struct device *
+device_create_groups_vargs(struct class *class, struct device *parent,
+			   dev_t devt, void *drvdata,
+			   const struct attribute_group **groups,
+			   const char *fmt, va_list args)
 {
 	struct device *dev = NULL;
 	int retval = -ENODEV;
@@ -1711,6 +1688,7 @@ struct device *device_create_vargs(struct class *class, struct device *parent,
 	dev->devt = devt;
 	dev->class = class;
 	dev->parent = parent;
+	dev->groups = groups;
 	dev->release = device_create_release;
 	dev_set_drvdata(dev, drvdata);
 
@@ -1728,6 +1706,39 @@ error:
 	put_device(dev);
 	return ERR_PTR(retval);
 }
+
+/**
+ * device_create_vargs - creates a device and registers it with sysfs
+ * @class: pointer to the struct class that this device should be registered to
+ * @parent: pointer to the parent struct device of this new device, if any
+ * @devt: the dev_t for the char device to be added
+ * @drvdata: the data to be added to the device for callbacks
+ * @fmt: string for the device's name
+ * @args: va_list for the device's name
+ *
+ * This function can be used by char device classes.  A struct device
+ * will be created in sysfs, registered to the specified class.
+ *
+ * A "dev" file will be created, showing the dev_t for the device, if
+ * the dev_t is not 0,0.
+ * If a pointer to a parent struct device is passed in, the newly created
+ * struct device will be a child of that device in sysfs.
+ * The pointer to the struct device will be returned from the call.
+ * Any further sysfs files that might be required can be created using this
+ * pointer.
+ *
+ * Returns &struct device pointer on success, or ERR_PTR() on error.
+ *
+ * Note: the struct class passed to this function must have previously
+ * been created with a call to class_create().
+ */
+struct device *device_create_vargs(struct class *class, struct device *parent,
+				   dev_t devt, void *drvdata, const char *fmt,
+				   va_list args)
+{
+	return device_create_groups_vargs(class, parent, devt, drvdata, NULL,
+					  fmt, args);
+}
 EXPORT_SYMBOL_GPL(device_create_vargs);
 
 /**
@@ -1767,6 +1778,50 @@ struct device *device_create(struct class *class, struct device *parent,
 }
 EXPORT_SYMBOL_GPL(device_create);
 
+/**
+ * device_create_with_groups - creates a device and registers it with sysfs
+ * @class: pointer to the struct class that this device should be registered to
+ * @parent: pointer to the parent struct device of this new device, if any
+ * @devt: the dev_t for the char device to be added
+ * @drvdata: the data to be added to the device for callbacks
+ * @groups: NULL-terminated list of attribute groups to be created
+ * @fmt: string for the device's name
+ *
+ * This function can be used by char device classes.  A struct device
+ * will be created in sysfs, registered to the specified class.
+ * Additional attributes specified in the groups parameter will also
+ * be created automatically.
+ *
+ * A "dev" file will be created, showing the dev_t for the device, if
+ * the dev_t is not 0,0.
+ * If a pointer to a parent struct device is passed in, the newly created
+ * struct device will be a child of that device in sysfs.
+ * The pointer to the struct device will be returned from the call.
+ * Any further sysfs files that might be required can be created using this
+ * pointer.
+ *
+ * Returns &struct device pointer on success, or ERR_PTR() on error.
+ *
+ * Note: the struct class passed to this function must have previously
+ * been created with a call to class_create().
+ */
+struct device *device_create_with_groups(struct class *class,
+					 struct device *parent, dev_t devt,
+					 void *drvdata,
+					 const struct attribute_group **groups,
+					 const char *fmt, ...)
+{
+	va_list vargs;
+	struct device *dev;
+
+	va_start(vargs, fmt);
+	dev = device_create_groups_vargs(class, parent, devt, drvdata, groups,
+					 fmt, vargs);
+	va_end(vargs);
+	return dev;
+}
+EXPORT_SYMBOL_GPL(device_create_with_groups);
+
 static int __match_devt(struct device *dev, const void *data)
 {
 	const dev_t *devt = data;
diff --git a/include/linux/device.h b/include/linux/device.h
index f207a8f49f80..bd5931e89f74 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -938,6 +938,11 @@ extern __printf(5, 6)
 struct device *device_create(struct class *cls, struct device *parent,
 			     dev_t devt, void *drvdata,
 			     const char *fmt, ...);
+extern __printf(6, 7)
+struct device *device_create_with_groups(struct class *cls,
+			     struct device *parent, dev_t devt, void *drvdata,
+			     const struct attribute_group **groups,
+			     const char *fmt, ...);
 extern void device_destroy(struct class *cls, dev_t devt);
 
 /*
-- 
cgit v1.2.3


From d05a6f96c76062b5f25858ac02cf677602076f7e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sun, 14 Jul 2013 16:05:58 -0700
Subject: driver core: add default groups to struct class

We should be using groups, not attribute lists, for classes to allow
subdirectories, and soon, binary files.  Groups are just more flexible
overall, so add them.

The dev_attrs list will go away after all in-kernel users are converted
to use dev_groups.

Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 drivers/base/core.c    | 9 ++++++++-
 include/linux/device.h | 4 +++-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/base/core.c b/drivers/base/core.c
index a8aae1823f73..8856d74545d9 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -528,9 +528,12 @@ static int device_add_attrs(struct device *dev)
 	int error;
 
 	if (class) {
-		error = device_add_attributes(dev, class->dev_attrs);
+		error = device_add_groups(dev, class->dev_groups);
 		if (error)
 			return error;
+		error = device_add_attributes(dev, class->dev_attrs);
+		if (error)
+			goto err_remove_class_groups;
 		error = device_add_bin_attributes(dev, class->dev_bin_attrs);
 		if (error)
 			goto err_remove_class_attrs;
@@ -563,6 +566,9 @@ static int device_add_attrs(struct device *dev)
  err_remove_class_attrs:
 	if (class)
 		device_remove_attributes(dev, class->dev_attrs);
+ err_remove_class_groups:
+	if (class)
+		device_remove_groups(dev, class->dev_groups);
 
 	return error;
 }
@@ -581,6 +587,7 @@ static void device_remove_attrs(struct device *dev)
 	if (class) {
 		device_remove_attributes(dev, class->dev_attrs);
 		device_remove_bin_attributes(dev, class->dev_bin_attrs);
+		device_remove_groups(dev, class->dev_groups);
 	}
 }
 
diff --git a/include/linux/device.h b/include/linux/device.h
index bd5931e89f74..22b546a58591 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -320,6 +320,7 @@ int subsys_virtual_register(struct bus_type *subsys,
  * @name:	Name of the class.
  * @owner:	The module owner.
  * @class_attrs: Default attributes of this class.
+ * @dev_groups:	Default attributes of the devices that belong to the class.
  * @dev_attrs:	Default attributes of the devices belong to the class.
  * @dev_bin_attrs: Default binary attributes of the devices belong to the class.
  * @dev_kobj:	The kobject that represents this class and links it into the hierarchy.
@@ -349,7 +350,8 @@ struct class {
 	struct module		*owner;
 
 	struct class_attribute		*class_attrs;
-	struct device_attribute		*dev_attrs;
+	struct device_attribute		*dev_attrs;	/* use dev_groups instead */
+	const struct attribute_group	**dev_groups;
 	struct bin_attribute		*dev_bin_attrs;
 	struct kobject			*dev_kobj;
 
-- 
cgit v1.2.3


From 3493f69f4c4e8703961919a9a56c2d2e6a25b46f Mon Sep 17 00:00:00 2001
From: Oliver Schinagl <oliver@schinagl.nl>
Date: Sun, 14 Jul 2013 16:05:59 -0700
Subject: sysfs: add more helper macro's for (bin_)attribute(_groups)

With the recent changes to sysfs there's various helper macro's.
However there's no RW, RO BIN_ helper macro's. This patch adds them.

Signed-off-by: Oliver Schinagl <oliver@schinagl.nl>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 51 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 38 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index 2c3b6a30697d..d907a7328025 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -17,6 +17,7 @@
 #include <linux/list.h>
 #include <linux/lockdep.h>
 #include <linux/kobject_ns.h>
+#include <linux/stat.h>
 #include <linux/atomic.h>
 
 struct kobject;
@@ -94,15 +95,18 @@ struct attribute_group {
 #define __ATTR_IGNORE_LOCKDEP	__ATTR
 #endif
 
-#define ATTRIBUTE_GROUPS(name)					\
-static const struct attribute_group name##_group = {		\
-	.attrs = name##_attrs,					\
-};								\
-static const struct attribute_group *name##_groups[] = {	\
-	&name##_group,						\
+#define __ATTRIBUTE_GROUPS(_name)				\
+static const struct attribute_group *_name##_groups[] = {	\
+	&_name##_group,						\
 	NULL,							\
 }
 
+#define ATTRIBUTE_GROUPS(_name)					\
+static const struct attribute_group _name##_group = {		\
+	.attrs = _name##_attrs,					\
+};								\
+__ATTRIBUTE_GROUPS(_name)
+
 #define attr_name(_attr) (_attr).attr.name
 
 struct file;
@@ -132,15 +136,36 @@ struct bin_attribute {
  */
 #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr)
 
-/* macro to create static binary attributes easier */
-#define BIN_ATTR(_name, _mode, _read, _write, _size)		\
-struct bin_attribute bin_attr_##_name = {			\
-	.attr = {.name = __stringify(_name), .mode = _mode },	\
-	.read	= _read,					\
-	.write	= _write,					\
-	.size	= _size,					\
+/* macros to create static binary attributes easier */
+#define __BIN_ATTR(_name, _mode, _read, _write, _size) {		\
+	.attr = { .name = __stringify(_name), .mode = _mode },		\
+	.read	= _read,						\
+	.write	= _write,						\
+	.size	= _size,						\
+}
+
+#define __BIN_ATTR_RO(_name, _size) {					\
+	.attr	= { .name = __stringify(_name), .mode = S_IRUGO },	\
+	.read	= _name##_read,						\
+	.size	= _size,						\
 }
 
+#define __BIN_ATTR_RW(_name, _size) __BIN_ATTR(_name,			\
+				   (S_IWUSR | S_IRUGO), _name##_read,	\
+				   _name##_write)
+
+#define __BIN_ATTR_NULL __ATTR_NULL
+
+#define BIN_ATTR(_name, _mode, _read, _write, _size)			\
+struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read,	\
+					_write, _size)
+
+#define BIN_ATTR_RO(_name, _size)					\
+struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size)
+
+#define BIN_ATTR_RW(_name, _size)					\
+struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size)
+
 struct sysfs_ops {
 	ssize_t	(*show)(struct kobject *, struct attribute *,char *);
 	ssize_t	(*store)(struct kobject *,struct attribute *,const char *, size_t);
-- 
cgit v1.2.3


From aa01aa3ca205ea04f44423a58bae38aec886fb96 Mon Sep 17 00:00:00 2001
From: Oliver Schinagl <oliver@schinagl.nl>
Date: Sun, 14 Jul 2013 16:06:00 -0700
Subject: sysfs: use file mode defines from stat.h

With the last patches stat.h was included to the header, and thus those
permission defines should be used.

Signed-off-by: Oliver Schinagl <oliver@schinagl.nl>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/sysfs.h | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index d907a7328025..9e8a9b555ad6 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -69,18 +69,19 @@ struct attribute_group {
  * for examples..
  */
 
-#define __ATTR(_name,_mode,_show,_store) { \
-	.attr = {.name = __stringify(_name), .mode = _mode },	\
-	.show	= _show,					\
-	.store	= _store,					\
+#define __ATTR(_name,_mode,_show,_store) { 				\
+	.attr = {.name = __stringify(_name), .mode = _mode },		\
+	.show	= _show,						\
+	.store	= _store,						\
 }
 
-#define __ATTR_RO(_name) { \
-	.attr	= { .name = __stringify(_name), .mode = 0444 },	\
-	.show	= _name##_show,					\
+#define __ATTR_RO(_name) {						\
+	.attr	= { .name = __stringify(_name), .mode = S_IRUGO },	\
+	.show	= _name##_show,						\
 }
 
-#define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store)
+#define __ATTR_RW(_name) __ATTR(_name, (S_IWUSR | S_IRUGO),		\
+			 _name##_show, _name##_store)
 
 #define __ATTR_NULL { .attr = { .name = NULL } }
 
-- 
cgit v1.2.3


From c0d15cc7ee8c0d1970197d9eb1727503bcdd2471 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Tue, 16 Jul 2013 22:44:08 -0400
Subject: linked-list: Remove __list_for_each

__list_for_each used to be the non prefetch() aware list walking
primitive.  When we removed the prefetch macros from the list routines,
it became redundant.  Given it does exactly the same thing as
list_for_each now, we might as well remove it and call list_for_each
directly.

All users of __list_for_each have been converted to list_for_each calls
in the current merge window.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/list.h | 11 -----------
 1 file changed, 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index b83e5657365a..f4d8a2f12a33 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -380,17 +380,6 @@ static inline void list_splice_tail_init(struct list_head *list,
 #define list_for_each(pos, head) \
 	for (pos = (head)->next; pos != (head); pos = pos->next)
 
-/**
- * __list_for_each	-	iterate over a list
- * @pos:	the &struct list_head to use as a loop cursor.
- * @head:	the head for your list.
- *
- * This variant doesn't differ from list_for_each() any more.
- * We don't do prefetching in either case.
- */
-#define __list_for_each(pos, head) \
-	for (pos = (head)->next; pos != (head); pos = pos->next)
-
 /**
  * list_for_each_prev	-	iterate over a list backwards
  * @pos:	the &struct list_head to use as a loop cursor.
-- 
cgit v1.2.3


From 8c5bd7adb2ce47e6aa39d17b2375f69b0c0aa255 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
Date: Thu, 18 Jul 2013 02:08:06 +0200
Subject: ACPI / video / i915: No ACPI backlight if firmware expects Windows 8

According to Matthew Garrett, "Windows 8 leaves backlight control up
to individual graphics drivers rather than making ACPI calls itself.
There's plenty of evidence to suggest that the Intel driver for
Windows [8] doesn't use the ACPI interface, including the fact that
it's broken on a bunch of machines when the OS claims to support
Windows 8.  The simplest thing to do appears to be to disable the
ACPI backlight interface on these systems".

There's a problem with that approach, however, because simply
avoiding to register the ACPI backlight interface if the firmware
calls _OSI for Windows 8 may not work in the following situations:
 (1) The ACPI backlight interface actually works on the given system
     and the i915 driver is not loaded (e.g. another graphics driver
     is used).
 (2) The ACPI backlight interface doesn't work on the given system,
     but there is a vendor platform driver that will register its
     own, equally broken, backlight interface if not prevented from
     doing so by the ACPI subsystem.
Therefore we need to allow the ACPI backlight interface to be
registered until the i915 driver is loaded which then will unregister
it if the firmware has called _OSI for Windows 8 (or will register
the ACPI video driver without backlight support if not already
present).

For this reason, introduce an alternative function for registering
ACPI video, acpi_video_register_with_quirks(), that will check
whether or not the ACPI video driver has already been registered
and whether or not the backlight Windows 8 quirk has to be applied.
If the quirk has to be applied, it will block the ACPI backlight
support and either unregister the backlight interface if the ACPI
video driver has already been registered, or register the ACPI
video driver without the backlight interface otherwise.  Make
the i915 driver use acpi_video_register_with_quirks() instead of
acpi_video_register() in i915_driver_load().

This change is based on earlier patches from Matthew Garrett,
Chun-Yi Lee and Seth Forshee and includes a fix from Aaron Lu's.

References: https://bugzilla.kernel.org/show_bug.cgi?id=51231
Tested-by: Aaron Lu <aaron.lu@intel.com>
Tested-by: Igor Gnatenko <i.gnatenko.brain@gmail.com>
Tested-by: Yves-Alexis Perez <corsac@debian.org>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Reviewed-by: Aaron Lu <aaron.lu@intel.com>
Acked-by: Matthew Garrett <matthew.garrett@nebula.com>
---
 drivers/acpi/internal.h         | 11 +++++++
 drivers/acpi/video.c            | 69 ++++++++++++++++++++++++++++++++++++-----
 drivers/acpi/video_detect.c     | 21 +++++++++++++
 drivers/gpu/drm/i915/i915_dma.c |  2 +-
 include/acpi/video.h            | 11 ++++++-
 include/linux/acpi.h            |  1 +
 6 files changed, 105 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h
index 3a50a34fe176..227aca77ee1e 100644
--- a/drivers/acpi/internal.h
+++ b/drivers/acpi/internal.h
@@ -164,4 +164,15 @@ struct platform_device;
 int acpi_create_platform_device(struct acpi_device *adev,
 				const struct acpi_device_id *id);
 
+/*--------------------------------------------------------------------------
+					Video
+  -------------------------------------------------------------------------- */
+#if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE)
+bool acpi_video_backlight_quirks(void);
+bool acpi_video_verify_backlight_support(void);
+#else
+static inline bool acpi_video_backlight_quirks(void) { return false; }
+static inline bool acpi_video_verify_backlight_support(void) { return false; }
+#endif
+
 #endif /* _ACPI_INTERNAL_H_ */
diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index f236e172d948..e9d4bb60c35c 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -44,6 +44,8 @@
 #include <linux/suspend.h>
 #include <acpi/video.h>
 
+#include "internal.h"
+
 #define PREFIX "ACPI: "
 
 #define ACPI_VIDEO_BUS_NAME		"Video Bus"
@@ -901,7 +903,7 @@ static void acpi_video_device_find_cap(struct acpi_video_device *device)
 	if (acpi_video_init_brightness(device))
 		return;
 
-	if (acpi_video_backlight_support()) {
+	if (acpi_video_verify_backlight_support()) {
 		struct backlight_properties props;
 		struct pci_dev *pdev;
 		acpi_handle acpi_parent;
@@ -1356,8 +1358,8 @@ acpi_video_switch_brightness(struct acpi_video_device *device, int event)
 	unsigned long long level_current, level_next;
 	int result = -EINVAL;
 
-	/* no warning message if acpi_backlight=vendor is used */
-	if (!acpi_video_backlight_support())
+	/* no warning message if acpi_backlight=vendor or a quirk is used */
+	if (!acpi_video_verify_backlight_support())
 		return 0;
 
 	if (!device->brightness)
@@ -1859,6 +1861,46 @@ static int acpi_video_bus_remove(struct acpi_device *device)
 	return 0;
 }
 
+static acpi_status video_unregister_backlight(acpi_handle handle, u32 lvl,
+					      void *context, void **rv)
+{
+	struct acpi_device *acpi_dev;
+	struct acpi_video_bus *video;
+	struct acpi_video_device *dev, *next;
+
+	if (acpi_bus_get_device(handle, &acpi_dev))
+		return AE_OK;
+
+	if (acpi_match_device_ids(acpi_dev, video_device_ids))
+		return AE_OK;
+
+	video = acpi_driver_data(acpi_dev);
+	if (!video)
+		return AE_OK;
+
+	acpi_video_bus_stop_devices(video);
+	mutex_lock(&video->device_list_lock);
+	list_for_each_entry_safe(dev, next, &video->video_device_list, entry) {
+		if (dev->backlight) {
+			backlight_device_unregister(dev->backlight);
+			dev->backlight = NULL;
+			kfree(dev->brightness->levels);
+			kfree(dev->brightness);
+		}
+		if (dev->cooling_dev) {
+			sysfs_remove_link(&dev->dev->dev.kobj,
+					  "thermal_cooling");
+			sysfs_remove_link(&dev->cooling_dev->device.kobj,
+					  "device");
+			thermal_cooling_device_unregister(dev->cooling_dev);
+			dev->cooling_dev = NULL;
+		}
+	}
+	mutex_unlock(&video->device_list_lock);
+	acpi_video_bus_start_devices(video);
+	return AE_OK;
+}
+
 static int __init is_i740(struct pci_dev *dev)
 {
 	if (dev->device == 0x00D1)
@@ -1890,14 +1932,25 @@ static int __init intel_opregion_present(void)
 	return opregion;
 }
 
-int acpi_video_register(void)
+int __acpi_video_register(bool backlight_quirks)
 {
-	int result = 0;
+	bool no_backlight;
+	int result;
+
+	no_backlight = backlight_quirks ? acpi_video_backlight_quirks() : false;
+
 	if (register_count) {
 		/*
-		 * if the function of acpi_video_register is already called,
-		 * don't register the acpi_vide_bus again and return no error.
+		 * If acpi_video_register() has been called already, don't try
+		 * to register acpi_video_bus, but unregister backlight devices
+		 * if no backlight support is requested.
 		 */
+		if (no_backlight)
+			acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT,
+					    ACPI_UINT32_MAX,
+					    video_unregister_backlight,
+					    NULL, NULL, NULL);
+
 		return 0;
 	}
 
@@ -1913,7 +1966,7 @@ int acpi_video_register(void)
 
 	return 0;
 }
-EXPORT_SYMBOL(acpi_video_register);
+EXPORT_SYMBOL(__acpi_video_register);
 
 void acpi_video_unregister(void)
 {
diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c
index e6bd910bc6ed..826e52def080 100644
--- a/drivers/acpi/video_detect.c
+++ b/drivers/acpi/video_detect.c
@@ -38,6 +38,8 @@
 #include <linux/dmi.h>
 #include <linux/pci.h>
 
+#include "internal.h"
+
 #define PREFIX "ACPI: "
 
 ACPI_MODULE_NAME("video");
@@ -234,6 +236,17 @@ static void acpi_video_caps_check(void)
 		acpi_video_get_capabilities(NULL);
 }
 
+bool acpi_video_backlight_quirks(void)
+{
+	if (acpi_gbl_osi_data >= ACPI_OSI_WIN_8) {
+		acpi_video_caps_check();
+		acpi_video_support |= ACPI_VIDEO_SKIP_BACKLIGHT;
+		return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL(acpi_video_backlight_quirks);
+
 /* Promote the vendor interface instead of the generic video module.
  * This function allow DMI blacklists to be implemented by externals
  * platform drivers instead of putting a big blacklist in video_detect.c
@@ -278,6 +291,14 @@ int acpi_video_backlight_support(void)
 }
 EXPORT_SYMBOL(acpi_video_backlight_support);
 
+/* For the ACPI video driver use only. */
+bool acpi_video_verify_backlight_support(void)
+{
+	return (acpi_video_support & ACPI_VIDEO_SKIP_BACKLIGHT) ?
+		false : acpi_video_backlight_support();
+}
+EXPORT_SYMBOL(acpi_video_verify_backlight_support);
+
 /*
  * Use acpi_backlight=vendor/video to force that backlight switching
  * is processed by vendor specific acpi drivers or video.ko driver.
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index adb319b53ecd..cf188ab7051a 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1648,7 +1648,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
 	if (INTEL_INFO(dev)->num_pipes) {
 		/* Must be done after probing outputs */
 		intel_opregion_init(dev);
-		acpi_video_register();
+		acpi_video_register_with_quirks();
 	}
 
 	if (IS_GEN5(dev))
diff --git a/include/acpi/video.h b/include/acpi/video.h
index 61109f2609fc..b26dc4fb7ba8 100644
--- a/include/acpi/video.h
+++ b/include/acpi/video.h
@@ -17,12 +17,21 @@ struct acpi_device;
 #define ACPI_VIDEO_DISPLAY_LEGACY_TV      0x0200
 
 #if (defined CONFIG_ACPI_VIDEO || defined CONFIG_ACPI_VIDEO_MODULE)
-extern int acpi_video_register(void);
+extern int __acpi_video_register(bool backlight_quirks);
+static inline int acpi_video_register(void)
+{
+	return __acpi_video_register(false);
+}
+static inline int acpi_video_register_with_quirks(void)
+{
+	return __acpi_video_register(true);
+}
 extern void acpi_video_unregister(void);
 extern int acpi_video_get_edid(struct acpi_device *device, int type,
 			       int device_id, void **edid);
 #else
 static inline int acpi_video_register(void) { return 0; }
+static inline int acpi_video_register_with_quirks(void) { return 0; }
 static inline void acpi_video_unregister(void) { return; }
 static inline int acpi_video_get_edid(struct acpi_device *device, int type,
 				      int device_id, void **edid)
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 353ba256f368..6ad72f92469c 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -191,6 +191,7 @@ extern bool wmi_has_guid(const char *guid);
 #define ACPI_VIDEO_BACKLIGHT_DMI_VIDEO			0x0200
 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VENDOR		0x0400
 #define ACPI_VIDEO_OUTPUT_SWITCHING_DMI_VIDEO		0x0800
+#define ACPI_VIDEO_SKIP_BACKLIGHT			0x1000
 
 #if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE)
 
-- 
cgit v1.2.3


From d4b812dea4a236f729526facf97df1a9d18e191c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 18 Jul 2013 07:19:26 -0700
Subject: vlan: mask vlan prio bits

In commit 48cc32d38a52d0b68f91a171a8d00531edc6a46e
("vlan: don't deliver frames for unknown vlans to protocols")
Florian made sure we set pkt_type to PACKET_OTHERHOST
if the vlan id is set and we could find a vlan device for this
particular id.

But we also have a problem if prio bits are set.

Steinar reported an issue on a router receiving IPv6 frames with a
vlan tag of 4000 (id 0, prio 2), and tunneled into a sit device,
because skb->vlan_tci is set.

Forwarded frame is completely corrupted : We can see (8100:4000)
being inserted in the middle of IPv6 source address :

16:48:00.780413 IP6 2001:16d8:8100:4000:ee1c:0:9d9:bc87 >
9f94:4d95:2001:67c:29f4::: ICMP6, unknown icmp6 type (0), length 64
       0x0000:  0000 0029 8000 c7c3 7103 0001 a0ae e651
       0x0010:  0000 0000 ccce 0b00 0000 0000 1011 1213
       0x0020:  1415 1617 1819 1a1b 1c1d 1e1f 2021 2223
       0x0030:  2425 2627 2829 2a2b 2c2d 2e2f 3031 3233

It seems we are not really ready to properly cope with this right now.

We can probably do better in future kernels :
vlan_get_ingress_priority() should be a netdev property instead of
a per vlan_dev one.

For stable kernels, lets clear vlan_tci to fix the bugs.

Reported-by: Steinar H. Gunderson <sesse@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/if_vlan.h |  3 +--
 net/8021q/vlan_core.c   |  2 +-
 net/core/dev.c          | 11 +++++++++--
 3 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
index cdcbafa9b39a..715c343f7c00 100644
--- a/include/linux/if_vlan.h
+++ b/include/linux/if_vlan.h
@@ -79,9 +79,8 @@ static inline int is_vlan_dev(struct net_device *dev)
 }
 
 #define vlan_tx_tag_present(__skb)	((__skb)->vlan_tci & VLAN_TAG_PRESENT)
-#define vlan_tx_nonzero_tag_present(__skb) \
-	(vlan_tx_tag_present(__skb) && ((__skb)->vlan_tci & VLAN_VID_MASK))
 #define vlan_tx_tag_get(__skb)		((__skb)->vlan_tci & ~VLAN_TAG_PRESENT)
+#define vlan_tx_tag_get_id(__skb)	((__skb)->vlan_tci & VLAN_VID_MASK)
 
 #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
 
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 8a15eaadc4bd..4a78c4de9f20 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -9,7 +9,7 @@ bool vlan_do_receive(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	__be16 vlan_proto = skb->vlan_proto;
-	u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK;
+	u16 vlan_id = vlan_tx_tag_get_id(skb);
 	struct net_device *vlan_dev;
 	struct vlan_pcpu_stats *rx_stats;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index a3d8d44cb7f4..26755dd40daa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3580,8 +3580,15 @@ ncls:
 		}
 	}
 
-	if (vlan_tx_nonzero_tag_present(skb))
-		skb->pkt_type = PACKET_OTHERHOST;
+	if (unlikely(vlan_tx_tag_present(skb))) {
+		if (vlan_tx_tag_get_id(skb))
+			skb->pkt_type = PACKET_OTHERHOST;
+		/* Note: we might in the future use prio bits
+		 * and set skb->priority like in vlan_do_receive()
+		 * For the time being, just ignore Priority Code Point
+		 */
+		skb->vlan_tci = 0;
+	}
 
 	/* deliver only exact match when indicated */
 	null_or_dev = deliver_exact ? skb->dev : NULL;
-- 
cgit v1.2.3


From 24924a20dab603089011f9d3eb7622f0f6ef93c0 Mon Sep 17 00:00:00 2001
From: Peng Tao <bergwolf@gmail.com>
Date: Thu, 18 Jul 2013 22:09:08 +0800
Subject: vfs: constify dentry parameter in d_count()

so that it can be used in places like d_compare/d_hash
without causing a compiler warning.

Signed-off-by: Peng Tao <tao.peng@emc.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/dcache.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 3092df3614ae..b90337c9d468 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -324,7 +324,7 @@ static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq)
 	return ret;
 }
 
-static inline unsigned d_count(struct dentry *dentry)
+static inline unsigned d_count(const struct dentry *dentry)
 {
 	return dentry->d_count;
 }
-- 
cgit v1.2.3